scan-v5.json (30931B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Fara-7B: An Efficient Agentic Model for Computer Use", 6 "authors": [ 7 "Ahmed Awadallah", 8 "Yash Lara", 9 "Raghav Magazine", 10 "Hussein Mozannar", 11 "Akshay Nambi", 12 "Yash Pandya", 13 "Aravind Rajeswaran", 14 "Corby Rosset", 15 "Alexey Taymanov", 16 "Vibhav Vineet", 17 "Spencer Whitehead", 18 "Andrew Zhao" 19 ], 20 "year": 2025, 21 "venue": "arXiv.org", 22 "arxiv_id": "2511.19663", 23 "doi": "10.48550/arXiv.2511.19663" 24 }, 25 "checklist": { 26 "claims_and_evidence": { 27 "abstract_claims_supported": { 28 "applies": true, 29 "answer": true, 30 "justification": "All abstract claims are supported: Fara-7B outperforms comparable models on WebVoyager (73.5% vs UI-TARS 66.4%), Online-Mind2Web (34.1% vs 31.3%), and WebTailBench (38.4% vs 19.5%), and FaraGen achieves ~$1 per trajectory as shown in Table 6.", 31 "source": "haiku" 32 }, 33 "causal_claims_justified": { 34 "applies": true, 35 "answer": true, 36 "justification": "Table 4 provides cumulative ablations of task-solving pipeline components showing causal contributions of each modification; Section 5.3 shows data scaling ablations from 18K to 1.8M action steps demonstrating causal effect of data quantity on performance.", 37 "source": "haiku" 38 }, 39 "generalization_bounded": { 40 "applies": true, 41 "answer": true, 42 "justification": "Claims are explicitly bounded to web-based CUA tasks; the limitations section acknowledges specific constraints (no drag-and-drop, no video/audio, reduced accuracy on complex tasks), and Discussion frames contributions within the web CUA domain.", 43 "source": "haiku" 44 }, 45 "alternative_explanations_discussed": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper attributes Fara-7B's superior performance over UI-TARS entirely to FaraGen data quality without considering alternatives such as differences in fine-tuning procedures, data mixture ratios, or domain-specific 
benchmark optimization.", 49 "source": "haiku" 50 }, 51 "proxy_outcome_distinction": { 52 "applies": true, 53 "answer": true, 54 "justification": "Section 5.1.2 explicitly acknowledges the gap between LLM-as-a-judge metrics and human evaluation (62% vs higher auto-eval scores), and calls for improved LLM-as-a-judge frameworks, demonstrating awareness of proxy limitations.", 55 "source": "haiku" 56 } 57 }, 58 "limitations_and_scope": { 59 "limitations_section_present": { 60 "applies": true, 61 "answer": true, 62 "justification": "A dedicated 'Limitations' paragraph appears in Section 7 (Discussion), covering action space limitations, reduced accuracy on complex tasks, susceptibility to hallucinations, and the incomplete framework for human-agent collaboration.", 63 "source": "haiku" 64 }, 65 "threats_to_validity_specific": { 66 "applies": true, 67 "answer": false, 68 "justification": "The limitations section lists generic model constraints (no drag-and-drop, no audio/video) rather than specific threats to validity; concerns about LLM-as-a-judge reliability, train/test domain overlap, and benchmark-specific optimization are not addressed as validity threats.", 69 "source": "haiku" 70 }, 71 "scope_boundaries_stated": { 72 "applies": true, 73 "answer": true, 74 "justification": "The paper explicitly bounds scope to web-based CUA tasks, notes Fara-7B is an 'experimental preview' not recommended for commercial or high-stakes applications, and provides specific use guidelines requiring sandboxed environments and human oversight.", 75 "source": "haiku" 76 } 77 }, 78 "conflicts_of_interest": { 79 "funding_disclosed": { 80 "applies": true, 81 "answer": false, 82 "justification": "No funding disclosure section is present in the paper; all authors are from Microsoft and this is a Microsoft Research product, but no formal funding statement appears.", 83 "source": "haiku" 84 }, 85 "affiliations_disclosed": { 86 "applies": true, 87 "answer": true, 88 "justification": "Microsoft 
affiliation is clearly evident through GitHub (github.com/microsoft/fara), HuggingFace (huggingface.co/microsoft/fara-7b), Azure Foundry links in the paper header, and references to 'Microsoft Responsible AI Policy'.", 89 "source": "haiku" 90 }, 91 "funder_independent_of_outcome": { 92 "applies": true, 93 "answer": false, 94 "justification": "All authors are Microsoft employees evaluating their own model (Fara-7B) and comparing against competing products; there is no independence between the funder/employer and the outcome being evaluated.", 95 "source": "haiku" 96 }, 97 "financial_interests_declared": { 98 "applies": true, 99 "answer": false, 100 "justification": "No competing interests statement, patent disclosure, or financial interests declaration appears anywhere in the paper.", 101 "source": "haiku" 102 } 103 }, 104 "scope_and_framing": { 105 "key_terms_defined": { 106 "applies": true, 107 "answer": true, 108 "justification": "Key terms are defined: 'Computer Use Agents (CUAs)' described in the introduction, 'Critical Points' explicitly defined in Section 2.2 with examples, 'pixel-in, action-out' formulation described in Section 3.1, and 'SoM Agents' explained in Section 5.", 109 "source": "haiku" 110 }, 111 "intended_contribution_clear": { 112 "applies": true, 113 "answer": true, 114 "justification": "The Contributions section explicitly lists three contributions: FaraGen (scalable synthetic data engine), Fara-7B (compact CUA model), and WebTailBench (new benchmark), each with clear descriptions of what they add.", 115 "source": "haiku" 116 }, 117 "engagement_with_prior_work": { 118 "applies": true, 119 "answer": true, 120 "justification": "Section 6 provides substantive related work covering tool-calling LLMs, multimodality, CUA models, and benchmarks, explaining how Fara-7B relates to and differs from prior approaches like UI-TARS, WebArena, and Mind2Web.", 121 "source": "haiku" 122 } 123 } 124 }, 125 "type_checklist": { 126 "empirical": { 127 "artifacts": 
{ 128 "code_released": { 129 "applies": true, 130 "answer": true, 131 "justification": "Source code is available at https://github.com/microsoft/fara; model weights are released on HuggingFace (huggingface.co/microsoft/fara-7b) and Azure Foundry, and an inference harness is mentioned as released.", 132 "source": "haiku" 133 }, 134 "data_released": { 135 "applies": true, 136 "answer": true, 137 "justification": "WebTailBench (609 tasks) and the Task Verification system are being released; evaluation uses public benchmarks (WebVoyager, Online-Mind2Web, DeepShop); however the 145K FaraGen training trajectories central to the paper's claims are not released.", 138 "source": "haiku" 139 }, 140 "environment_specified": { 141 "applies": true, 142 "answer": false, 143 "justification": "Appendix C provides hyperparameters and mentions Playwright, Browserbase, and Azure Machine Learning, but no Dockerfile, requirements.txt, or complete dependency specification is provided for reproduction.", 144 "source": "haiku" 145 }, 146 "reproduction_instructions": { 147 "applies": true, 148 "answer": false, 149 "justification": "An inference harness is mentioned as released on GitHub, but step-by-step instructions for reproducing training or full evaluation results are not provided in the paper; training trajectory data is also unavailable.", 150 "source": "haiku" 151 } 152 }, 153 "statistical_methodology": { 154 "confidence_intervals_or_error_bars": { 155 "applies": true, 156 "answer": true, 157 "justification": "Table 19 (appendix) reports mean ± standard deviation across 3 independent evaluation runs for all models on all four benchmarks; Figure 1 and Figure 6 show pass@k curves providing additional variance context.", 158 "source": "haiku" 159 }, 160 "significance_tests": { 161 "applies": true, 162 "answer": false, 163 "justification": "No formal statistical significance tests are applied to comparative claims; the paper reports means and standard deviations across 3 runs but does 
not perform hypothesis testing to confirm that differences are statistically significant.", 164 "source": "haiku" 165 }, 166 "effect_sizes_reported": { 167 "applies": true, 168 "answer": true, 169 "justification": "Raw accuracy differences with baseline context are reported throughout (e.g., Fara-7B 73.5% vs UI-TARS 66.4% on WebVoyager; 38.4% vs 19.5% on WebTailBench; cost $0.025 vs $0.30+ for proprietary agents).", 170 "source": "haiku" 171 }, 172 "sample_size_justified": { 173 "applies": true, 174 "answer": false, 175 "justification": "The number of benchmark tasks and 3 evaluation runs are not statistically justified; 609 WebTailBench tasks and 3 independent runs are chosen without power analysis or justification for providing reliable performance estimates.", 176 "source": "haiku" 177 }, 178 "variance_reported": { 179 "applies": true, 180 "answer": true, 181 "justification": "Table 19 reports standard deviations for all models across 3 runs (e.g., Fara-7B: 73.5±1.0 on WebVoyager, 38.4±0.7 on WebTailBench); Tables 10 and 12 report standard deviations for per-task token and action counts.", 182 "source": "haiku" 183 } 184 }, 185 "evaluation_design": { 186 "baselines_included": { 187 "applies": true, 188 "answer": true, 189 "justification": "Multiple baselines are included covering both paradigms: SoM agents (GPT-4o, o3, GPT-5), GLM-4.1V-9B-Thinking, OpenAI computer-use-preview, and UI-TARS-1.5-7B (same base model as Fara-7B).", 190 "source": "haiku" 191 }, 192 "baselines_contemporary": { 193 "applies": true, 194 "answer": true, 195 "justification": "All baselines are from 2024-2025 (UI-TARS January 2025, GPT-5 and o3 accessed October-November 2025, OpenAI computer-use-preview contemporary), making them current with Fara-7B's development.", 196 "source": "haiku" 197 }, 198 "ablation_study": { 199 "applies": true, 200 "answer": true, 201 "justification": "Table 4 provides cumulative ablations of task-solving pipeline modifications on WebVoyager; Section 5.3 and 
Figure 7 show data scaling (1%, 10%, 100% of data) and inference step scaling ablations.", 202 "source": "haiku" 203 }, 204 "multiple_metrics": { 205 "applies": true, 206 "answer": true, 207 "justification": "Evaluation uses four task benchmarks (WebVoyager, Online-Mind2Web, DeepShop, WebTailBench), grounding benchmarks (ScreenSpot V1/V2), safety benchmarks (AgentHarm-Chat, WebTailBench-Refusals), and efficiency metrics (cost, tokens, actions per task).", 208 "source": "haiku" 209 }, 210 "human_evaluation": { 211 "applies": true, 212 "answer": true, 213 "justification": "Section 5.1.2 reports third-party human evaluation by Browserbase where annotators independently verified Fara-7B trajectories on WebVoyager tasks, establishing 62% accuracy versus higher LLM-judge scores.", 214 "source": "haiku" 215 }, 216 "held_out_test_set": { 217 "applies": true, 218 "answer": true, 219 "justification": "WebTailBench (609 tasks) serves as a held-out evaluation set not used for training; existing public benchmarks (WebVoyager, Online-Mind2Web, DeepShop) are also independent test sets.", 220 "source": "haiku" 221 }, 222 "per_category_breakdown": { 223 "applies": true, 224 "answer": true, 225 "justification": "Table 11 provides per-category WebTailBench results across all 11 segments (Shopping, Flights, Hotels, Restaurants, Activities, Ticketing, Real-Estate, Jobs/Careers, Shopping List, Comparison Shopping, Compositional Tasks) for all models.", 226 "source": "haiku" 227 }, 228 "failure_cases_discussed": { 229 "applies": true, 230 "answer": true, 231 "justification": "Section 5.4 describes the 4 specific cases where Fara-7B failed to stop before critical points (marking email read, liking a post, publishing a post without confirmation); Table 2 shows failure rates by task segment; WebSurfer loop failures are analyzed quantitatively.", 232 "source": "haiku" 233 }, 234 "negative_results_reported": { 235 "applies": true, 236 "answer": true, 237 "justification": "The paper reports 
poor real-estate task performance (23.6%, lowest category), 4/23 critical point failures, low trajectory yield for difficult segments (3% for flights without Browserbase), and weaker compositional task performance relative to frontier models.", 238 "source": "haiku" 239 } 240 }, 241 "setup_transparency": { 242 "model_versions_specified": { 243 "applies": true, 244 "answer": true, 245 "justification": "Model versions are specified: Qwen2.5-VL-7B as base model, GPT-4o (Hurst et al., 2024), o3 and GPT-5 with system cards cited, UI-TARS-1.5-7B (Qin et al., 2025); OpenAI models noted as accessed in October and November 2025.", 246 "source": "haiku" 247 }, 248 "prompts_provided": { 249 "applies": true, 250 "answer": false, 251 "justification": "The paper states they 'retain the same prompts... published with each benchmark' for evaluation but does not reproduce actual prompts; data generation prompts are described at a high level without full text.", 252 "source": "haiku" 253 }, 254 "hyperparameters_reported": { 255 "applies": true, 256 "answer": true, 257 "justification": "Appendix C provides full training hyperparameters: AdamW with β1=0.9, β2=0.95, cosine LR warmup, initial LR 5e-6, gradient clipping max 1, 2 epochs (~28k iterations), batch size 128, 64 H100 GPUs, DeepSpeed Stage 3, bf16 precision.", 258 "source": "haiku" 259 }, 260 "scaffolding_described": { 261 "applies": true, 262 "answer": true, 263 "justification": "The full Orchestrator-WebSurfer scaffolding is described in detail including the ledger system (Table 1), stopping logic (Table 3), UserSimulator behavior, Trajectory Verification pipeline with three complementary verifiers, and Fara-7B's inference-time formulation (Section 3.1).", 264 "source": "haiku" 265 }, 266 "data_preprocessing_documented": { 267 "applies": true, 268 "answer": true, 269 "justification": "Data preprocessing is documented: SoM element IDs replaced with bounding box center coordinates; data mixing ratios shown in Table 16 (1.2M 
trajectory steps, 562K grounding, 3K refusals, 1.8K UI VQA/captioning); upsampling of longer trajectories described.", 270 "source": "haiku" 271 } 272 }, 273 "data_integrity": { 274 "raw_data_available": { 275 "applies": true, 276 "answer": false, 277 "justification": "The 145K FaraGen training trajectories are not publicly released; only WebTailBench (609 tasks) and the verification system are being released, making independent verification of training data quality impossible.", 278 "source": "haiku" 279 }, 280 "data_collection_described": { 281 "applies": true, 282 "answer": true, 283 "justification": "The full FaraGen data collection pipeline is described in detail in Section 2, including three task proposal strategies, multi-agent task solving architecture, and three-verifier trajectory filtering with agreement statistics (83.3% with human judgments, 16.7% false positive rate).", 284 "source": "haiku" 285 }, 286 "recruitment_methods_described": { 287 "applies": false, 288 "answer": false, 289 "justification": "Standard benchmark evaluation with automated and third-party human verification; no participant recruitment for a primary study.", 290 "source": "haiku" 291 }, 292 "data_pipeline_documented": { 293 "applies": true, 294 "answer": true, 295 "justification": "The complete data pipeline from URL seed selection through task proposal, solving, verification, and filtering is documented in Sections 2.1-2.4 with funnel statistics at each stage (Table 2 shows error rates, completion rates, and verification success rates per task segment).", 296 "source": "haiku" 297 } 298 }, 299 "contamination": { 300 "training_cutoff_stated": { 301 "applies": true, 302 "answer": false, 303 "justification": "The training data cutoff for the Qwen2.5-VL-7B base model is not stated; FaraGen data collection dates are also unspecified, leaving uncertainty about whether benchmark examples appeared in base model pretraining.", 304 "source": "haiku" 305 }, 306 
"train_test_overlap_discussed": { 307 "applies": true, 308 "answer": false, 309 "justification": "The paper does not discuss potential overlap between FaraGen's training URLs (ClueWeb22, Tranco web corpus) and benchmark test websites (WebVoyager, Online-Mind2Web domains), despite both drawing from the same live web ecosystem.", 310 "source": "haiku" 311 }, 312 "benchmark_contamination_addressed": { 313 "applies": true, 314 "answer": false, 315 "justification": "Neither the base model (Qwen2.5-VL) contamination on benchmark examples nor potential domain overlap between FaraGen training sites and WebVoyager/Mind2Web test sites is discussed.", 316 "source": "haiku" 317 } 318 }, 319 "human_studies": { 320 "pre_registered": { 321 "applies": false, 322 "answer": false, 323 "justification": "No human participant study; third-party human evaluation by Browserbase is a quality verification exercise, not a controlled study.", 324 "source": "haiku" 325 }, 326 "irb_or_ethics_approval": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participant study requiring IRB approval.", 330 "source": "haiku" 331 }, 332 "demographics_reported": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participants in the study; annotator demographics not applicable.", 336 "source": "haiku" 337 }, 338 "inclusion_exclusion_criteria": { 339 "applies": false, 340 "answer": false, 341 "justification": "No human participant study.", 342 "source": "haiku" 343 }, 344 "randomization_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human participant study requiring randomization.", 348 "source": "haiku" 349 }, 350 "blinding_described": { 351 "applies": false, 352 "answer": false, 353 "justification": "No human participant study requiring blinding.", 354 "source": "haiku" 355 }, 356 "attrition_reported": { 357 "applies": false, 358 "answer": false, 359 "justification": "No human participant study.", 360 "source": "haiku" 
361 } 362 }, 363 "cost_and_practicality": { 364 "inference_cost_reported": { 365 "applies": true, 366 "answer": true, 367 "justification": "Table 10 reports per-task cost for Fara-7B ($0.025 on WebVoyager) and all baselines; Table 12 shows per-task cost on WebTailBench ($0.069); cost components (input/output tokens with per-token pricing) are detailed in Appendix A.", 368 "source": "haiku" 369 }, 370 "compute_budget_stated": { 371 "applies": true, 372 "answer": true, 373 "justification": "Training used 64 H100 GPUs for ~28K iterations (2 epochs); data generation cost estimated in Table 6 ($0.59-$1.08 per trajectory); data generation infrastructure described as 40 Azure ML nodes running 4 browsers each (600 trajectories/hour throughput).", 374 "source": "haiku" 375 } 376 } 377 } 378 }, 379 "claims": [ 380 { 381 "claim": "Fara-7B achieves 73.5% on WebVoyager, outperforming all other 7B-scale CUA models and larger systems including OpenAI computer-use-preview (70.9%)", 382 "evidence": "Table 9 shows Fara-7B (73.5%) vs UI-TARS-1.5-7B (66.4%), OpenAI computer-use-preview (70.9%), SoM GPT-4o (65.1%); Table 19 shows 73.5±1.0 across 3 independent runs", 383 "supported": "strong" 384 }, 385 { 386 "claim": "FaraGen generates verified web trajectories at approximately $1 per task using premium models, enabling large-scale CUA data creation", 387 "evidence": "Table 6 shows costs of $0.59 (o4-mini), $1.08 (o3), $1.00 (GPT-5) per trajectory; 145K trajectories generated at this cost spanning 70K unique domains", 388 "supported": "strong" 389 }, 390 { 391 "claim": "Fara-7B achieves a new Pareto frontier of accuracy vs. 
cost at $0.025 per task versus $0.30+ for proprietary agents of comparable or lower accuracy", 392 "evidence": "Table 10: Fara-7B $0.025, SoM GPT-5 $0.316, SoM o3 $0.514, OpenAI computer-use-preview $0.913; Figure 1 visualizes the Pareto frontier with pass@k curves", 393 "supported": "strong" 394 }, 395 { 396 "claim": "High-quality synthetic data is sufficient to enable a small 7B model to approach the capabilities of much larger frontier models", 397 "evidence": "Fara-7B outperforms OpenAI computer-use-preview on WebVoyager and WebTailBench despite much smaller size; within 3 points of o3 on flights/hotels subcategories despite <4K training examples each", 398 "supported": "moderate" 399 }, 400 { 401 "claim": "Fara-7B achieves superior safety with 94.2% refusal rate on AgentHarm-Chat versus 84.6% for OpenAI computer-use-preview and 3.8% for UI-TARS-1.5-7B", 402 "evidence": "Table 14 shows refusal rates across CUA models on AgentHarm-Chat and WebTailBench-Refusals; Fara-7B leads on both; note Fara-7B may have a distributional advantage on WebTailBench-Refusals from similar training data", 403 "supported": "strong" 404 }, 405 { 406 "claim": "Using Browserbase improves trajectory generation yield by more than 3x for complex tasks", 407 "evidence": "Table 2 shows shopping yield increases from 9% to 35% and flights from 3% to 11% with Browserbase, representing 3.9x and 3.7x improvements respectively", 408 "supported": "strong" 409 }, 410 { 411 "claim": "Fara-7B benefits as much from inference step scaling as UI-TARS does, despite using only SFT while UI-TARS uses extensive RL", 412 "evidence": "Figure 7 (middle, right) shows similar scaling slopes for both models on WebVoyager and Online-Mind2Web as maximum steps increase from 15 to 100", 413 "supported": "moderate" 414 } 415 ], 416 "methodology_tags": [ 417 "benchmark-eval", 418 "case-study" 419 ], 420 "key_findings": "Fara-7B, a 7B parameter CUA model trained via supervised fine-tuning on 145K synthetic web trajectories from 
FaraGen, achieves 73.5% on WebVoyager—outperforming UI-TARS-1.5-7B (66.4%), OpenAI computer-use-preview (70.9%), and SoM GPT-4o (65.1%)—at only $0.025 per task versus ~$0.30 for proprietary systems. FaraGen demonstrates that scalable synthetic data generation via multi-agent task proposal, automated solving, and multi-verifier filtering can produce high-quality CUA training data at ~$1 per trajectory. On the newly introduced WebTailBench, Fara-7B achieves 38.4% versus 25.7% for OpenAI computer-use-preview and 19.5% for UI-TARS, though frontier reasoning models (GPT-5: 60.4%, o3: 52.7%) remain substantially ahead on complex multi-step tasks. Positive data and inference step scaling trends suggest further improvements are achievable, and Fara-7B's SFT-only training shows equivalent step-budget scaling to RL-trained UI-TARS, a surprising finding that challenges assumptions about the necessity of RL for agentic scaling.", 421 "red_flags": [ 422 { 423 "flag": "Self-evaluation conflict", 424 "detail": "All authors are Microsoft employees evaluating their own product (Fara-7B) with no independent evaluation; the paper also introduces and primarily evaluates on its own benchmark (WebTailBench), creating potential for benchmark-specific optimization." 425 }, 426 { 427 "flag": "Training data not released", 428 "detail": "The 145K FaraGen trajectories central to the paper's main claims are not publicly released, making it impossible to independently verify training data quality, composition, or reproduce the model." 429 }, 430 { 431 "flag": "Live website evaluation instability", 432 "detail": "Evaluation on live websites required modifying 98 WebVoyager tasks (48 removed as impossible, 50 modified with new dates), introducing selection bias and making direct comparisons with published results unreliable." 433 }, 434 { 435 "flag": "LLM-as-a-judge vs. human evaluation gap uncharacterized", 436 "detail": "Human evaluation yields 62% vs. 
higher LLM-judge scores for Fara-7B, yet LLM-as-a-judge is the primary evaluation metric; the magnitude and direction of auto-eval inflation are not systematically characterized across all benchmarks and models." 437 }, 438 { 439 "flag": "Benchmark contamination unaddressed", 440 "detail": "Both FaraGen training data (ClueWeb22, Tranco URLs) and test benchmarks draw from the same live web; potential domain overlap and base model (Qwen2.5-VL) pretraining contamination on benchmark examples are not discussed." 441 }, 442 { 443 "flag": "Safety evaluation underpowered", 444 "detail": "Critical point evaluation uses only 23 synthetic tasks on simulated websites; the similarity of Fara-7B's training data to WebTailBench-Refusals may inflate its results on that benchmark (acknowledged in paper), making safety comparisons partially confounded." 445 } 446 ], 447 "cited_papers": [ 448 { 449 "title": "UI-TARS: Pioneering Automated GUI Interaction with Native Agents", 450 "relevance": "Primary 7B-scale baseline sharing the same Qwen2.5-VL base model; key comparison point for demonstrating FaraGen data quality advantage independent of base model choice" 451 }, 452 { 453 "title": "WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models", 454 "relevance": "Primary evaluation benchmark used for main results; represents the dominant prior approach to end-to-end web agents" 455 }, 456 { 457 "title": "Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks", 458 "relevance": "Foundation for FaraGen's multi-agent task solving pipeline (Orchestrator + WebSurfer architecture that Fara-7B distills from)" 459 }, 460 { 461 "title": "Mind2Web: Towards a Generalist Agent for the Web", 462 "relevance": "Key related benchmark and dataset for web agents; Online-Mind2Web variant used as evaluation benchmark" 463 }, 464 { 465 "title": "AgentInstruct: Toward Generative Teaching with Agentic Flows", 466 "relevance": "Related synthetic data generation approach for agentic 
tasks; FaraGen's task proposal strategy builds on similar ideas" 467 }, 468 { 469 "title": "An Illusion of Progress? Assessing the Current State of Web Agents", 470 "relevance": "Motivates multi-verifier design and the gap between auto-eval and human judgment; cited for verifier design approach" 471 }, 472 { 473 "title": "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents", 474 "relevance": "Safety evaluation benchmark used to measure Fara-7B's refusal capabilities against other CUA models" 475 }, 476 { 477 "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents", 478 "relevance": "Key prior CUA evaluation environment; motivates WebTailBench's focus on live websites over static sandboxes" 479 }, 480 { 481 "title": "OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments", 482 "relevance": "Related CUA evaluation environment used to run UI-TARS-1.5-7B for baseline comparison" 483 }, 484 { 485 "title": "Explorer: Scaling Exploration-driven Web Trajectory Synthesis for Multimodal Web Agents", 486 "relevance": "Related synthetic trajectory generation work used in FaraGen's agentic URL exploration strategy" 487 } 488 ], 489 "engagement_factors": { 490 "practical_relevance": { 491 "score": 3, 492 "justification": "Model weights publicly available on HuggingFace and Azure Foundry, inference harness on GitHub, benchmark released; directly applicable to practitioners building computer use agents with tight cost constraints." 493 }, 494 "surprise_contrarian": { 495 "score": 2, 496 "justification": "The finding that a 7B SFT-only model can match RL-trained models on inference step scaling and outperform much larger OpenAI computer-use-preview challenges prevailing assumptions about model scale and RL necessity for agentic tasks." 
497 }, 498 "fear_safety": { 499 "score": 2, 500 "justification": "Computer use agents capable of taking real-world actions (purchases, reservations, emails) with limited oversight raise legitimate concerns; the paper addresses safety but acknowledges CUAs remain experimental and insufficient for deployment in sensitive contexts." 501 }, 502 "drama_conflict": { 503 "score": 1, 504 "justification": "Microsoft challenging OpenAI's computer-use models has a competitive angle, but the paper is measured and technical rather than adversarial in framing." 505 }, 506 "demo_ability": { 507 "score": 3, 508 "justification": "Model is immediately accessible via HuggingFace and Azure Foundry with a released inference harness; practitioners can run Fara-7B on their own web tasks today." 509 }, 510 "brand_recognition": { 511 "score": 3, 512 "justification": "Microsoft Research paper comparing against GPT-5, o3, and OpenAI computer-use-preview; high brand recognition from both the producing institution and the frontier models used as reference points." 513 } 514 }, 515 "hn_data": { 516 "threads": [ 517 { 518 "hn_id": "46650465", 519 "title": "Show HN: Agint Flow – design software as a graph, then compile the graph to code", 520 "points": 5, 521 "comments": 3, 522 "url": "https://news.ycombinator.com/item?id=46650465", 523 "created_at": "2026-01-16T18:56:09Z" 524 }, 525 { 526 "hn_id": "46380330", 527 "title": "Breakthrough Listen Observations of 3I/Atlas with the Green Bank Telescope", 528 "points": 3, 529 "comments": 3, 530 "url": "https://news.ycombinator.com/item?id=46380330", 531 "created_at": "2025-12-24T23:14:21Z" 532 } 533 ], 534 "top_points": 5, 535 "total_points": 8, 536 "total_comments": 6 537 } 538 }