scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (30931B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Fara-7B: An Efficient Agentic Model for Computer Use",
      6     "authors": [
      7       "Ahmed Awadallah",
      8       "Yash Lara",
      9       "Raghav Magazine",
     10       "Hussein Mozannar",
     11       "Akshay Nambi",
     12       "Yash Pandya",
     13       "Aravind Rajeswaran",
     14       "Corby Rosset",
     15       "Alexey Taymanov",
     16       "Vibhav Vineet",
     17       "Spencer Whitehead",
     18       "Andrew Zhao"
     19     ],
     20     "year": 2025,
     21     "venue": "arXiv.org",
     22     "arxiv_id": "2511.19663",
     23     "doi": "10.48550/arXiv.2511.19663"
     24   },
     25   "checklist": {
     26     "claims_and_evidence": {
     27       "abstract_claims_supported": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "All abstract claims are supported: Fara-7B outperforms comparable models on WebVoyager (73.5% vs UI-TARS 66.4%), Online-Mind2Web (34.1% vs 31.3%), and WebTailBench (38.4% vs 19.5%), and FaraGen achieves ~$1 per trajectory as shown in Table 6.",
     31         "source": "haiku"
     32       },
     33       "causal_claims_justified": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Table 4 provides cumulative ablations of task-solving pipeline components showing causal contributions of each modification; Section 5.3 shows data scaling ablations from 18K to 1.8M action steps demonstrating causal effect of data quantity on performance.",
     37         "source": "haiku"
     38       },
     39       "generalization_bounded": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Claims are explicitly bounded to web-based CUA tasks; the limitations section acknowledges specific constraints (no drag-and-drop, no video/audio, reduced accuracy on complex tasks), and Discussion frames contributions within the web CUA domain.",
     43         "source": "haiku"
     44       },
     45       "alternative_explanations_discussed": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper attributes Fara-7B's superior performance over UI-TARS entirely to FaraGen data quality without considering alternatives such as differences in fine-tuning procedures, data mixture ratios, or domain-specific benchmark optimization.",
     49         "source": "haiku"
     50       },
     51       "proxy_outcome_distinction": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 5.1.2 explicitly acknowledges the gap between LLM-as-a-judge metrics and human evaluation (human-verified accuracy of 62%, below the corresponding auto-eval scores), and calls for improved LLM-as-a-judge frameworks, demonstrating awareness of proxy limitations.",
     55         "source": "haiku"
     56       }
     57     },
     58     "limitations_and_scope": {
     59       "limitations_section_present": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "A dedicated 'Limitations' paragraph appears in Section 7 (Discussion), covering action space limitations, reduced accuracy on complex tasks, susceptibility to hallucinations, and the incomplete framework for human-agent collaboration.",
     63         "source": "haiku"
     64       },
     65       "threats_to_validity_specific": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The limitations section lists generic model constraints (no drag-and-drop, no audio/video) rather than specific threats to validity; concerns about LLM-as-a-judge reliability, train/test domain overlap, and benchmark-specific optimization are not addressed as validity threats.",
     69         "source": "haiku"
     70       },
     71       "scope_boundaries_stated": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper explicitly bounds scope to web-based CUA tasks, notes Fara-7B is an 'experimental preview' not recommended for commercial or high-stakes applications, and provides specific use guidelines requiring sandboxed environments and human oversight.",
     75         "source": "haiku"
     76       }
     77     },
     78     "conflicts_of_interest": {
     79       "funding_disclosed": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No funding disclosure section is present in the paper; all authors are from Microsoft and this is a Microsoft Research product, but no formal funding statement appears.",
     83         "source": "haiku"
     84       },
     85       "affiliations_disclosed": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Microsoft affiliation is clearly evident through GitHub (github.com/microsoft/fara), HuggingFace (huggingface.co/microsoft/fara-7b), Azure Foundry links in the paper header, and references to 'Microsoft Responsible AI Policy'.",
     89         "source": "haiku"
     90       },
     91       "funder_independent_of_outcome": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "All authors are Microsoft employees evaluating their own model (Fara-7B) and comparing against competing products; there is no independence between the funder/employer and the outcome being evaluated.",
     95         "source": "haiku"
     96       },
     97       "financial_interests_declared": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No competing interests statement, patent disclosure, or financial interests declaration appears anywhere in the paper.",
    101         "source": "haiku"
    102       }
    103     },
    104     "scope_and_framing": {
    105       "key_terms_defined": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Key terms are defined: 'Computer Use Agents (CUAs)' described in the introduction, 'Critical Points' explicitly defined in Section 2.2 with examples, 'pixel-in, action-out' formulation described in Section 3.1, and 'SoM Agents' explained in Section 5.",
    109         "source": "haiku"
    110       },
    111       "intended_contribution_clear": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The Contributions section explicitly lists three contributions: FaraGen (scalable synthetic data engine), Fara-7B (compact CUA model), and WebTailBench (new benchmark), each with clear descriptions of what they add.",
    115         "source": "haiku"
    116       },
    117       "engagement_with_prior_work": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 6 provides substantive related work covering tool-calling LLMs, multimodality, CUA models, and benchmarks, explaining how Fara-7B relates to and differs from prior approaches like UI-TARS, WebArena, and Mind2Web.",
    121         "source": "haiku"
    122       }
    123     }
    124   },
    125   "type_checklist": {
    126     "empirical": {
    127       "artifacts": {
    128         "code_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "Source code is available at https://github.com/microsoft/fara; model weights are released on HuggingFace (huggingface.co/microsoft/fara-7b) and Azure Foundry, and an inference harness is mentioned as released.",
    132           "source": "haiku"
    133         },
    134         "data_released": {
    135           "applies": true,
    136           "answer": true,
    137           "justification": "WebTailBench (609 tasks) and the Task Verification system are being released; evaluation uses public benchmarks (WebVoyager, Online-Mind2Web, DeepShop); however the 145K FaraGen training trajectories central to the paper's claims are not released.",
    138           "source": "haiku"
    139         },
    140         "environment_specified": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "Appendix C provides hyperparameters and mentions Playwright, Browserbase, and Azure Machine Learning, but no Dockerfile, requirements.txt, or complete dependency specification is provided for reproduction.",
    144           "source": "haiku"
    145         },
    146         "reproduction_instructions": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "An inference harness is mentioned as released on GitHub, but step-by-step instructions for reproducing training or full evaluation results are not provided in the paper; training trajectory data is also unavailable.",
    150           "source": "haiku"
    151         }
    152       },
    153       "statistical_methodology": {
    154         "confidence_intervals_or_error_bars": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Table 19 (appendix) reports mean ± standard deviation across 3 independent evaluation runs for all models on all four benchmarks; Figure 1 and Figure 6 show pass@k curves providing additional variance context.",
    158           "source": "haiku"
    159         },
    160         "significance_tests": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "No formal statistical significance tests are applied to comparative claims; the paper reports means and standard deviations across 3 runs but does not perform hypothesis testing to confirm that differences are statistically significant.",
    164           "source": "haiku"
    165         },
    166         "effect_sizes_reported": {
    167           "applies": true,
    168           "answer": true,
    169           "justification": "Raw accuracy differences with baseline context are reported throughout (e.g., Fara-7B 73.5% vs UI-TARS 66.4% on WebVoyager; 38.4% vs 19.5% on WebTailBench; cost $0.025 vs $0.30+ for proprietary agents).",
    170           "source": "haiku"
    171         },
    172         "sample_size_justified": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Neither the benchmark sizes (e.g., 609 WebTailBench tasks) nor the choice of 3 independent evaluation runs is statistically justified; no power analysis or other argument is given that they provide reliable performance estimates.",
    176           "source": "haiku"
    177         },
    178         "variance_reported": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Table 19 reports standard deviations for all models across 3 runs (e.g., Fara-7B: 73.5±1.0 on WebVoyager, 38.4±0.7 on WebTailBench); Tables 10 and 12 report standard deviations for per-task token and action counts.",
    182           "source": "haiku"
    183         }
    184       },
    185       "evaluation_design": {
    186         "baselines_included": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Multiple baselines are included covering both paradigms: SoM agents (GPT-4o, o3, GPT-5), GLM-4.1V-9B-Thinking, OpenAI computer-use-preview, and UI-TARS-1.5-7B (same base model as Fara-7B).",
    190           "source": "haiku"
    191         },
    192         "baselines_contemporary": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "All baselines are from 2024-2025 (UI-TARS January 2025, GPT-5 and o3 accessed October-November 2025, OpenAI computer-use-preview contemporary), making them current with Fara-7B's development.",
    196           "source": "haiku"
    197         },
    198         "ablation_study": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Table 4 provides cumulative ablations of task-solving pipeline modifications on WebVoyager; Section 5.3 and Figure 7 show data scaling (1%, 10%, 100% of data) and inference step scaling ablations.",
    202           "source": "haiku"
    203         },
    204         "multiple_metrics": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Evaluation uses four task benchmarks (WebVoyager, Online-Mind2Web, DeepShop, WebTailBench), grounding benchmarks (ScreenSpot V1/V2), safety benchmarks (AgentHarm-Chat, WebTailBench-Refusals), and efficiency metrics (cost, tokens, actions per task).",
    208           "source": "haiku"
    209         },
    210         "human_evaluation": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Section 5.1.2 reports third-party human evaluation by Browserbase where annotators independently verified Fara-7B trajectories on WebVoyager tasks, establishing 62% accuracy versus higher LLM-judge scores.",
    214           "source": "haiku"
    215         },
    216         "held_out_test_set": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "WebTailBench (609 tasks) serves as a held-out evaluation set not used for training; existing public benchmarks (WebVoyager, Online-Mind2Web, DeepShop) are also independent test sets.",
    220           "source": "haiku"
    221         },
    222         "per_category_breakdown": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Table 11 provides per-category WebTailBench results across all 11 segments (Shopping, Flights, Hotels, Restaurants, Activities, Ticketing, Real-Estate, Jobs/Careers, Shopping List, Comparison Shopping, Compositional Tasks) for all models.",
    226           "source": "haiku"
    227         },
    228         "failure_cases_discussed": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Section 5.4 describes the 4 specific cases where Fara-7B failed to stop before critical points (marking email read, liking a post, publishing a post without confirmation); Table 2 shows failure rates by task segment; WebSurfer loop failures are analyzed quantitatively.",
    232           "source": "haiku"
    233         },
    234         "negative_results_reported": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "The paper reports poor real-estate task performance (23.6%, lowest category), 4/23 critical point failures, low trajectory yield for difficult segments (3% for flights without Browserbase), and weaker compositional task performance relative to frontier models.",
    238           "source": "haiku"
    239         }
    240       },
    241       "setup_transparency": {
    242         "model_versions_specified": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Model versions are specified: Qwen2.5-VL-7B as base model, GPT-4o (Hurst et al., 2024), o3 and GPT-5 with system cards cited, UI-TARS-1.5-7B (Qin et al., 2025); OpenAI models noted as accessed in October and November 2025.",
    246           "source": "haiku"
    247         },
    248         "prompts_provided": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "The paper states they 'retain the same prompts... published with each benchmark' for evaluation but does not reproduce actual prompts; data generation prompts are described at a high level without full text.",
    252           "source": "haiku"
    253         },
    254         "hyperparameters_reported": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "Appendix C provides full training hyperparameters: AdamW with β1=0.9, β2=0.95, cosine LR warmup, initial LR 5e-6, gradient clipping max 1, 2 epochs (~28k iterations), batch size 128, 64 H100 GPUs, DeepSpeed Stage 3, bf16 precision.",
    258           "source": "haiku"
    259         },
    260         "scaffolding_described": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "The full Orchestrator-WebSurfer scaffolding is described in detail including the ledger system (Table 1), stopping logic (Table 3), UserSimulator behavior, Trajectory Verification pipeline with three complementary verifiers, and Fara-7B's inference-time formulation (Section 3.1).",
    264           "source": "haiku"
    265         },
    266         "data_preprocessing_documented": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Data preprocessing is documented: SoM element IDs replaced with bounding box center coordinates; data mixing ratios shown in Table 16 (1.2M trajectory steps, 562K grounding, 3K refusals, 1.8K UI VQA/captioning); upsampling of longer trajectories described.",
    270           "source": "haiku"
    271         }
    272       },
    273       "data_integrity": {
    274         "raw_data_available": {
    275           "applies": true,
    276           "answer": false,
    277           "justification": "The 145K FaraGen training trajectories are not publicly released; only WebTailBench (609 tasks) and the verification system are being released, making independent verification of training data quality impossible.",
    278           "source": "haiku"
    279         },
    280         "data_collection_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "The full FaraGen data collection pipeline is described in detail in Section 2, including three task proposal strategies, multi-agent task solving architecture, and three-verifier trajectory filtering with agreement statistics (83.3% with human judgments, 16.7% false positive rate).",
    284           "source": "haiku"
    285         },
    286         "recruitment_methods_described": {
    287           "applies": false,
    288           "answer": false,
    289           "justification": "Standard benchmark evaluation with automated and third-party human verification; no participant recruitment for a primary study.",
    290           "source": "haiku"
    291         },
    292         "data_pipeline_documented": {
    293           "applies": true,
    294           "answer": true,
    295           "justification": "The complete data pipeline from URL seed selection through task proposal, solving, verification, and filtering is documented in Sections 2.1-2.4 with funnel statistics at each stage (Table 2 shows error rates, completion rates, and verification success rates per task segment).",
    296           "source": "haiku"
    297         }
    298       },
    299       "contamination": {
    300         "training_cutoff_stated": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "The training data cutoff for the Qwen2.5-VL-7B base model is not stated; FaraGen data collection dates are also unspecified, leaving uncertainty about whether benchmark examples appeared in base model pretraining.",
    304           "source": "haiku"
    305         },
    306         "train_test_overlap_discussed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "The paper does not discuss potential overlap between FaraGen's training URLs (ClueWeb22, Tranco web corpus) and benchmark test websites (WebVoyager, Online-Mind2Web domains), despite both drawing from the same live web ecosystem.",
    310           "source": "haiku"
    311         },
    312         "benchmark_contamination_addressed": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "Neither the base model (Qwen2.5-VL) contamination on benchmark examples nor potential domain overlap between FaraGen training sites and WebVoyager/Mind2Web test sites is discussed.",
    316           "source": "haiku"
    317         }
    318       },
    319       "human_studies": {
    320         "pre_registered": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participant study; third-party human evaluation by Browserbase is a quality verification exercise, not a controlled study.",
    324           "source": "haiku"
    325         },
    326         "irb_or_ethics_approval": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participant study requiring IRB approval.",
    330           "source": "haiku"
    331         },
    332         "demographics_reported": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants in the study; annotator demographics not applicable.",
    336           "source": "haiku"
    337         },
    338         "inclusion_exclusion_criteria": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participant study.",
    342           "source": "haiku"
    343         },
    344         "randomization_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participant study requiring randomization.",
    348           "source": "haiku"
    349         },
    350         "blinding_described": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participant study requiring blinding.",
    354           "source": "haiku"
    355         },
    356         "attrition_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "No human participant study.",
    360           "source": "haiku"
    361         }
    362       },
    363       "cost_and_practicality": {
    364         "inference_cost_reported": {
    365           "applies": true,
    366           "answer": true,
    367           "justification": "Table 10 reports per-task cost for Fara-7B ($0.025 on WebVoyager) and all baselines; Table 12 shows per-task cost on WebTailBench ($0.069); cost components (input/output tokens with per-token pricing) are detailed in Appendix A.",
    368           "source": "haiku"
    369         },
    370         "compute_budget_stated": {
    371           "applies": true,
    372           "answer": true,
    373           "justification": "Training used 64 H100 GPUs for ~28K iterations (2 epochs); data generation cost estimated in Table 6 ($0.59-$1.08 per trajectory); data generation infrastructure described as 40 Azure ML nodes running 4 browsers each (600 trajectories/hour throughput).",
    374           "source": "haiku"
    375         }
    376       }
    377     }
    378   },
    379   "claims": [
    380     {
    381       "claim": "Fara-7B achieves 73.5% on WebVoyager, outperforming all other 7B-scale CUA models and larger systems including OpenAI computer-use-preview (70.9%)",
    382       "evidence": "Table 9 shows Fara-7B (73.5%) vs UI-TARS-1.5-7B (66.4%), OpenAI computer-use-preview (70.9%), SoM GPT-4o (65.1%); Table 19 shows 73.5±1.0 across 3 independent runs",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "FaraGen generates verified web trajectories at approximately $1 per task using premium models, enabling large-scale CUA data creation",
    387       "evidence": "Table 6 shows costs of $0.59 (o4-mini), $1.08 (o3), $1.00 (GPT-5) per trajectory; 145K trajectories generated at this cost spanning 70K unique domains",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Fara-7B achieves a new Pareto frontier of accuracy vs. cost at $0.025 per task versus $0.30+ for proprietary agents of comparable or lower accuracy",
    392       "evidence": "Table 10: Fara-7B $0.025, SoM GPT-5 $0.316, SoM o3 $0.514, OpenAI computer-use-preview $0.913; Figure 1 visualizes the Pareto frontier with pass@k curves",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "High-quality synthetic data is sufficient to enable a small 7B model to approach the capabilities of much larger frontier models",
    397       "evidence": "Fara-7B outperforms OpenAI computer-use-preview on WebVoyager and WebTailBench despite much smaller size; within 3 points of o3 on flights/hotels subcategories despite <4K training examples each",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "Fara-7B achieves superior safety with 94.2% refusal rate on AgentHarm-Chat versus 84.6% for OpenAI computer-use-preview and 3.8% for UI-TARS-1.5-7B",
    402       "evidence": "Table 14 shows refusal rates across CUA models on AgentHarm-Chat and WebTailBench-Refusals; Fara-7B leads on both; note that Fara-7B may have a distributional advantage on WebTailBench-Refusals because its training data is similar to that benchmark",
    403       "supported": "strong"
    404     },
    405     {
    406       "claim": "Using Browserbase improves trajectory generation yield by more than 3x for complex tasks",
    407       "evidence": "Table 2 shows shopping yield increases from 9% to 35% and flights from 3% to 11% with Browserbase, representing 3.9x and 3.7x improvements respectively",
    408       "supported": "strong"
    409     },
    410     {
    411       "claim": "Fara-7B benefits as much from inference step scaling as UI-TARS does, despite using only SFT while UI-TARS uses extensive RL",
    412       "evidence": "Figure 7 (middle, right) shows similar scaling slopes for both models on WebVoyager and Online-Mind2Web as maximum steps increase from 15 to 100",
    413       "supported": "moderate"
    414     }
    415   ],
    416   "methodology_tags": [
    417     "benchmark-eval",
    418     "case-study"
    419   ],
    420   "key_findings": "Fara-7B, a 7B parameter CUA model trained via supervised fine-tuning on 145K synthetic web trajectories from FaraGen, achieves 73.5% on WebVoyager—outperforming UI-TARS-1.5-7B (66.4%), OpenAI computer-use-preview (70.9%), and SoM GPT-4o (65.1%)—at only $0.025 per task versus ~$0.30 for proprietary systems. FaraGen demonstrates that scalable synthetic data generation via multi-agent task proposal, automated solving, and multi-verifier filtering can produce high-quality CUA training data at ~$1 per trajectory. On the newly introduced WebTailBench, Fara-7B achieves 38.4% versus 25.7% for OpenAI computer-use-preview and 19.5% for UI-TARS, though frontier reasoning models (GPT-5: 60.4%, o3: 52.7%) remain substantially ahead on complex multi-step tasks. Positive data and inference step scaling trends suggest further improvements are achievable, and Fara-7B's SFT-only training shows equivalent step-budget scaling to RL-trained UI-TARS, a surprising finding that challenges assumptions about the necessity of RL for agentic scaling.",
    421   "red_flags": [
    422     {
    423       "flag": "Self-evaluation conflict",
    424       "detail": "All authors are Microsoft employees evaluating their own product (Fara-7B) with no independent evaluation; the paper also introduces and primarily evaluates on its own benchmark (WebTailBench), creating potential for benchmark-specific optimization."
    425     },
    426     {
    427       "flag": "Training data not released",
    428       "detail": "The 145K FaraGen trajectories central to the paper's main claims are not publicly released, making it impossible to independently verify training data quality, composition, or reproduce the model."
    429     },
    430     {
    431       "flag": "Live website evaluation instability",
    432       "detail": "Evaluation on live websites required modifying 98 WebVoyager tasks (48 removed as impossible, 50 modified with new dates), introducing selection bias and making direct comparisons with published results unreliable."
    433     },
    434     {
    435       "flag": "LLM-as-a-judge vs. human evaluation gap uncharacterized",
    436       "detail": "Human evaluation yields 62% vs. higher LLM-judge scores for Fara-7B, yet LLM-as-a-judge is the primary evaluation metric; the magnitude and direction of auto-eval inflation are not systematically characterized across all benchmarks and models."
    437     },
    438     {
    439       "flag": "Benchmark contamination unaddressed",
    440       "detail": "Both FaraGen training data (ClueWeb22, Tranco URLs) and test benchmarks draw from the same live web; potential domain overlap and base model (Qwen2.5-VL) pretraining contamination on benchmark examples are not discussed."
    441     },
    442     {
    443       "flag": "Safety evaluation underpowered",
    444       "detail": "Critical point evaluation uses only 23 synthetic tasks on simulated websites; Fara-7B's training data is similar to WebTailBench-Refusals, which may inflate its results on that benchmark (acknowledged in paper), making safety comparisons partially confounded."
    445     }
    446   ],
    447   "cited_papers": [
    448     {
    449       "title": "UI-TARS: Pioneering Automated GUI Interaction with Native Agents",
    450       "relevance": "Primary 7B-scale baseline sharing the same Qwen2.5-VL base model; key comparison point for demonstrating FaraGen data quality advantage independent of base model choice"
    451     },
    452     {
    453       "title": "WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models",
    454       "relevance": "Primary evaluation benchmark used for main results; represents the dominant prior approach to end-to-end web agents"
    455     },
    456     {
    457       "title": "Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks",
    458       "relevance": "Foundation for FaraGen's multi-agent task solving pipeline (Orchestrator + WebSurfer architecture that Fara-7B distills from)"
    459     },
    460     {
    461       "title": "Mind2Web: Towards a Generalist Agent for the Web",
    462       "relevance": "Key related benchmark and dataset for web agents; Online-Mind2Web variant used as evaluation benchmark"
    463     },
    464     {
    465       "title": "AgentInstruct: Toward Generative Teaching with Agentic Flows",
    466       "relevance": "Related synthetic data generation approach for agentic tasks; FaraGen's task proposal strategy builds on similar ideas"
    467     },
    468     {
    469       "title": "An Illusion of Progress? Assessing the Current State of Web Agents",
    470       "relevance": "Motivates multi-verifier design and the gap between auto-eval and human judgment; cited for verifier design approach"
    471     },
    472     {
    473       "title": "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents",
    474       "relevance": "Safety evaluation benchmark used to measure Fara-7B's refusal capabilities against other CUA models"
    475     },
    476     {
    477       "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents",
    478       "relevance": "Key prior CUA evaluation environment; motivates WebTailBench's focus on live websites over static sandboxes"
    479     },
    480     {
    481       "title": "OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments",
    482       "relevance": "Related CUA evaluation environment used to run UI-TARS-1.5-7B for baseline comparison"
    483     },
    484     {
    485       "title": "Explorer: Scaling Exploration-driven Web Trajectory Synthesis for Multimodal Web Agents",
    486       "relevance": "Related synthetic trajectory generation work used in FaraGen's agentic URL exploration strategy"
    487     }
    488   ],
    489   "engagement_factors": {
    490     "practical_relevance": {
    491       "score": 3,
    492       "justification": "Model weights publicly available on HuggingFace and Azure Foundry, inference harness on GitHub, benchmark released; directly applicable to practitioners building computer use agents with tight cost constraints."
    493     },
    494     "surprise_contrarian": {
    495       "score": 2,
    496       "justification": "The finding that a 7B SFT-only model can match RL-trained models on inference step scaling and outperform the much larger OpenAI computer-use-preview challenges prevailing assumptions about model scale and RL necessity for agentic tasks."
    497     },
    498     "fear_safety": {
    499       "score": 2,
    500       "justification": "Computer use agents capable of taking real-world actions (purchases, reservations, emails) with limited oversight raise legitimate concerns; the paper addresses safety but acknowledges CUAs remain experimental and insufficient for deployment in sensitive contexts."
    501     },
    502     "drama_conflict": {
    503       "score": 1,
    504       "justification": "Microsoft challenging OpenAI's computer-use models has a competitive angle, but the paper is measured and technical rather than adversarial in framing."
    505     },
    506     "demo_ability": {
    507       "score": 3,
    508       "justification": "Model is immediately accessible via HuggingFace and Azure Foundry with a released inference harness; practitioners can run Fara-7B on their own web tasks today."
    509     },
    510     "brand_recognition": {
    511       "score": 3,
    512       "justification": "Microsoft Research paper comparing against GPT-5, o3, and OpenAI computer-use-preview; high brand recognition from both the producing institution and the frontier models used as reference points."
    513     }
    514   },
    515   "hn_data": {
    516     "threads": [
    517       {
    518         "hn_id": "46650465",
    519         "title": "Show HN: Agint Flow – design software as a graph, then compile the graph to code",
    520         "points": 5,
    521         "comments": 3,
    522         "url": "https://news.ycombinator.com/item?id=46650465",
    523         "created_at": "2026-01-16T18:56:09Z"
    524       },
    525       {
    526         "hn_id": "46380330",
    527         "title": "Breakthrough Listen Observations of 3I/Atlas with the Green Bank Telescope",
    528         "points": 3,
    529         "comments": 3,
    530         "url": "https://news.ycombinator.com/item?id=46380330",
    531         "created_at": "2025-12-24T23:14:21Z"
    532       }
    533     ],
    534     "top_points": 5,
    535     "total_points": 8,
    536     "total_comments": 6
    537   }
    538 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs