scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32818B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Fara-7B: An Efficient Agentic Model for Computer Use",
      6     "authors": [
      7       "Ahmed Awadallah",
      8       "Yash Lara",
      9       "Raghav Magazine",
     10       "Hussein Mozannar",
     11       "Akshay Nambi",
     12       "Yash Pandya",
     13       "Aravind Rajeswaran",
     14       "Corby Rosset",
     15       "Alexey Taymanov",
     16       "Vibhav Vineet",
     17       "Spencer Whitehead",
     18       "Andrew Zhao"
     19     ],
     20     "year": 2025,
     21     "venue": "arXiv.org",
     22     "arxiv_id": "2511.19663",
     23     "doi": "10.48550/arXiv.2511.19663"
     24   },
     25   "checklist": {
     26     "claims_and_evidence": {
     27       "abstract_claims_supported": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Abstract claims about outperforming comparable-size models and being competitive with larger frontier models are supported by Tables 9 and 10. The ~$1 per trajectory claim is supported by Table 6.",
     31         "source": "opus"
     32       },
     33       "causal_claims_justified": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Ablation studies (Table 4, Figure 7) support causal claims about the contribution of data scaling and pipeline modifications. These are controlled single-variable manipulations.",
     37         "source": "opus"
     38       },
     39       "generalization_bounded": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The title says 'Computer Use' broadly but evaluation is web-only. The paper acknowledges limitations (no drag-and-drop, no audio/video) in Section 7, but the abstract and framing suggest broader computer use capability than what was tested.",
     43         "source": "opus"
     44       },
     45       "alternative_explanations_discussed": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No substantive discussion of alternative explanations for the results. For example, the base model (Qwen2.5-VL) may already have web navigation capabilities; the paper doesn't control for this beyond the grounding comparison in Table 13a.",
     49         "source": "opus"
     50       },
     51       "proxy_outcome_distinction": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper measures success rate on benchmarks and frames this as 'agentic capabilities' and 'computer use' ability. The gap between benchmark task completion and real-world computer use utility is not discussed, though the human eval gap (62% vs 73.5%) hints at it.",
     55         "source": "opus"
     56       }
     57     },
     58     "limitations_and_scope": {
     59       "limitations_section_present": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Section 7 (Discussion) contains a 'Limitations' paragraph discussing action space limitations, accuracy on complex tasks, hallucinations, and the critical point framework's incompleteness.",
     63         "source": "opus"
     64       },
     65       "threats_to_validity_specific": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Specific threats discussed: the absence of training data beyond critical points may cause unexpected behavior (Section 2.2), BrowserBase dependency for reliable evaluation (Section 5.1.1), time-sensitive tasks going stale, human eval vs auto-eval gap (Section 5.1.2), small critical point evaluation dataset (Section 5.4).",
     69         "source": "opus"
     70       },
     71       "scope_boundaries_stated": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Section 7 explicitly states what the model cannot do: drag-and-drop, video/audio consumption, game playing, ultra-low latency tasks. Guidelines for Safe Use state the model should not be used in regulated domains or commercial applications.",
     75         "source": "opus"
     76       }
     77     },
     78     "conflicts_of_interest": {
     79       "funding_disclosed": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No funding source is disclosed. The paper is from 'AI Frontiers' (Microsoft) but no explicit funding statement is provided.",
     83         "source": "opus"
     84       },
     85       "affiliations_disclosed": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper is labeled 'AI Frontiers' which is a Microsoft research group. Authors are from Microsoft. The affiliation is visible in the header.",
     89         "source": "opus"
     90       },
     91       "funder_independent_of_outcome": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "Microsoft funds this research and has a direct financial interest in the outcome — Fara-7B is released on Azure Foundry (a Microsoft product). The funder is not independent.",
     95         "source": "opus"
     96       },
     97       "financial_interests_declared": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No competing interests or financial interests statement is provided. Microsoft employees are evaluating a Microsoft product released on a Microsoft platform.",
    101         "source": "opus"
    102       }
    103     },
    104     "scope_and_framing": {
    105       "key_terms_defined": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Key terms are defined: 'Computer Use Agent' (perceive and take actions on computer), 'critical point' (binding transaction requiring user permission), 'pixel-in action-out' formulation, and 'SoM agent' (Set-of-Marks).",
    109         "source": "haiku"
    110       },
    111       "intended_contribution_clear": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Three explicit contributions are stated in Section 1: FaraGen (data engine), Fara-7B (CUA model), and WebTailBench (benchmark), each clearly described with distinct goals.",
    115         "source": "haiku"
    116       },
    117       "engagement_with_prior_work": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 6 (Related Work) systematically covers tool-calling LLMs, multimodality/screen understanding, agentic CUA models, and CUA benchmarks, situating Fara-7B's pixel-in approach relative to DOM-based alternatives.",
    121         "source": "haiku"
    122       }
    123     }
    124   },
    125   "type_checklist": {
    126     "empirical": {
    127       "artifacts": {
    128         "code_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "The paper provides a GitHub link (https://github.com/microsoft/fara) and model weights on HuggingFace (https://huggingface.co/microsoft/fara-7b) and Azure Foundry.",
    132           "source": "opus"
    133         },
    134         "data_released": {
    135           "applies": true,
    136           "answer": true,
    137           "justification": "WebTailBench (609 tasks) is stated to be released. The model is open-weight. However, the full 145K trajectory training dataset is not explicitly stated as released. WebTailBench counts as partial data release; standard benchmarks used (WebVoyager, Mind2Web) are public.",
    138           "source": "opus"
    139         },
    140         "environment_specified": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. Appendix C mentions DeepSpeed Stage 3 and bf16 precision on 64 H100 GPUs, but no library versions or dependency specifications.",
    144           "source": "opus"
    145         },
    146         "reproduction_instructions": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repo is referenced but the paper itself does not include a reproducing-results section.",
    150           "source": "opus"
    151         }
    152       },
    153       "statistical_methodology": {
    154         "confidence_intervals_or_error_bars": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Table 19 in Appendix D.3 reports mean ± standard deviation for all benchmarks across three runs (e.g., 'Fara-7B 73.5 ± 1.0' on WebVoyager).",
    158           "source": "opus"
    159         },
    160         "significance_tests": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "No statistical significance tests are used. Claims of outperformance are based on comparing mean success rates without any hypothesis testing.",
    164           "source": "opus"
    165         },
    166         "effect_sizes_reported": {
    167           "applies": true,
    168           "answer": true,
    169           "justification": "Absolute accuracy differences are reported with baseline context throughout (e.g., Fara-7B 73.5% vs UI-TARS-1.5-7B 66.4% on WebVoyager, Table 9). Cost differences are also contextualized ($0.025 vs $0.30+ per task, Table 10).",
    170           "source": "opus"
    171         },
    172         "sample_size_justified": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No justification for why the benchmark sizes or number of runs (3) are sufficient. The critical point evaluation uses only 23 tasks with no power analysis.",
    176           "source": "opus"
    177         },
    178         "variance_reported": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Table 19 reports standard deviation across three independent runs for all benchmarks. Tables 10 and 12 report mean ± std for actions per task, input/output tokens.",
    182           "source": "opus"
    183         }
    184       },
    185       "evaluation_design": {
    186         "baselines_included": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Multiple baselines included: UI-TARS-1.5-7B, OpenAI computer-use-preview, SoM agents with GPT-4o/o3/GPT-5, and GLM-4.1V-9B-Thinking (Tables 9, 11).",
    190           "source": "opus"
    191         },
    192         "baselines_contemporary": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Baselines include GPT-5, o3, and UI-TARS-1.5-7B — all 2025 models. These are state-of-the-art at time of writing.",
    196           "source": "opus"
    197         },
    198         "ablation_study": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Table 4 shows cumulative ablations of the task solving pipeline. Figure 7 (left) shows data scaling ablation (1%, 10%, 100%). Figure 7 (middle/right) shows inference step scaling.",
    202           "source": "opus"
    203         },
    204         "multiple_metrics": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Success rate across 4 benchmarks (WebVoyager, Online-Mind2Web, DeepShop, WebTailBench), plus cost per task, actions per task, token usage, grounding accuracy (ScreenSpot), and safety refusal rate.",
    208           "source": "opus"
    209         },
    210         "human_evaluation": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Section 5.1.2 states Browserbase independently verified Fara-7B with human annotators, establishing 62% accuracy on WebVoyager. The gap between auto-eval and human eval is acknowledged but not deeply explored.",
    214           "source": "opus"
    215         },
    216         "held_out_test_set": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Evaluation is on separate benchmarks (WebVoyager, Online-Mind2Web, DeepShop, WebTailBench) not used in training. The model was trained on FaraGen data, evaluated on independent benchmarks.",
    220           "source": "opus"
    221         },
    222         "per_category_breakdown": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Table 11 provides per-category breakdown of WebTailBench across 11 segments. Table 13b breaks down grounding by Mobile/Desktop/Web and Text/Icon.",
    226           "source": "opus"
    227         },
    228         "failure_cases_discussed": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Section 2.2 discusses looping failures. Table 2 shows the 'funnel' of trajectory losses. Section 5.4 discusses 4 critical point failures. Section 7 discusses limitations including inability to drag-and-drop, hallucinations, etc.",
    232           "source": "opus"
    233         },
    234         "negative_results_reported": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Table 2 shows very low success rates for difficult tasks (flights 3% without BrowserBase). Table 4 shows weak baseline performance (33%). The human evaluation gap (62% human vs 73.5% auto) is a negative finding.",
    238           "source": "opus"
    239         }
    240       },
    241       "setup_transparency": {
    242         "model_versions_specified": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "The paper refers to 'GPT-4o', 'o3', 'GPT-5', 'o4-mini' without API version strings or snapshot dates. The base model is specified as 'Qwen2.5-VL-7B' which is more specific but still lacks a checkpoint hash or date.",
    246           "source": "opus"
    247         },
    248         "prompts_provided": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "The paper describes prompts conceptually (e.g., the Orchestrator ledger fields in Table 1, verifier descriptions) but does not provide the actual prompt text used for any component. The appendix shows a single screenshot QA prompt excerpt but not the full prompts.",
    252           "source": "opus"
    253         },
    254         "hyperparameters_reported": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "Appendix C reports training hyperparameters: AdamW with β1=0.9, β2=0.95, cosine warmup for 10% of steps, learning rate 5e-6, gradient clipping max 1, 2 epochs (~28k iterations), batch size 128, 64 H100 GPUs, DeepSpeed Stage 3, bf16.",
    258           "source": "opus"
    259         },
    260         "scaffolding_described": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "The multi-agent scaffolding (Orchestrator, WebSurfer, UserSimulator, verifiers) is described in extensive detail in Sections 2.1-2.3 with figures, logic tables (Tables 1, 3), and the action space (Table 7).",
    264           "source": "opus"
    265         },
    266         "data_preprocessing_documented": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Section 3.2 describes trajectory data processing: extracting screenshots/reasoning/actions from WebSurfer outputs, replacing SoM element IDs with center coordinates, keeping only N=3 recent observations. Table 16 shows data mixture composition.",
    270           "source": "opus"
    271         }
    272       },
    273       "data_integrity": {
    274         "raw_data_available": {
    275           "applies": true,
    276           "answer": false,
    277           "justification": "The full 145K trajectory training dataset is not released. Only the model weights, WebTailBench tasks, and code are released. The raw FaraGen data is not available for verification.",
    278           "source": "opus"
    279         },
    280         "data_collection_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Section 2 extensively describes the data collection: three task proposal strategies (targeted URL, agentic exploration, exemplar), task solving with Magentic-One, and three-stage verification. Table 5 provides statistics. Table 2 shows per-segment yields.",
    284           "source": "opus"
    285         },
    286         "recruitment_methods_described": {
    287           "applies": false,
    288           "answer": false,
    289           "justification": "No human participants were recruited as subjects. Data is synthetically generated from websites. The third-party human evaluation by Browserbase is briefly described but this is evaluation, not data collection for training.",
    290           "source": "opus"
    291         },
    292         "data_pipeline_documented": {
    293           "applies": true,
    294           "answer": true,
    295           "justification": "The full pipeline is documented: task proposal → task solving → verification → filtering. Table 2 shows the funnel with error rates, completion rates, and verification rates per segment. Table 5 provides final statistics.",
    296           "source": "opus"
    297         }
    298       },
    299       "contamination": {
    300         "training_cutoff_stated": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "The base model Qwen2.5-VL's training cutoff is not stated. The FaraGen data is collected from live websites but no cutoff date is given for when data collection occurred.",
    304           "source": "opus"
    305         },
    306         "train_test_overlap_discussed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "No discussion of whether WebVoyager tasks, Online-Mind2Web tasks, or other benchmark tasks overlap with FaraGen training data or the base model's pretraining data.",
    310           "source": "opus"
    311         },
    312         "benchmark_contamination_addressed": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "WebVoyager and Mind2Web were published before the model's likely training cutoff. No contamination analysis is performed. WebTailBench is new but the other benchmarks are not addressed.",
    316           "source": "opus"
    317         }
    318       },
    319       "human_studies": {
    320         "pre_registered": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants as research subjects. The third-party Browserbase evaluation is quality assurance, not a human subjects study.",
    324           "source": "opus"
    325         },
    326         "irb_or_ethics_approval": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants as research subjects.",
    330           "source": "opus"
    331         },
    332         "demographics_reported": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants as research subjects.",
    336           "source": "opus"
    337         },
    338         "inclusion_exclusion_criteria": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants as research subjects.",
    342           "source": "opus"
    343         },
    344         "randomization_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants as research subjects.",
    348           "source": "opus"
    349         },
    350         "blinding_described": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants as research subjects.",
    354           "source": "opus"
    355         },
    356         "attrition_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "No human participants as research subjects.",
    360           "source": "opus"
    361         }
    362       },
    363       "cost_and_practicality": {
    364         "inference_cost_reported": {
    365           "applies": true,
    366           "answer": true,
    367           "justification": "Table 10 reports $0.025 per task for Fara-7B on WebVoyager. Table 12 reports $0.069 per task on WebTailBench. Table 6 reports data generation costs (~$1 per trajectory).",
    368           "source": "opus"
    369         },
    370         "compute_budget_stated": {
    371           "applies": true,
    372           "answer": true,
    373           "justification": "Appendix C states 64 H100 GPUs for training, 2 epochs (~28k iterations). Section 2.2 states 40 nodes with 4 browsers each for data generation achieving 600 trajectories/hour. Table 6 gives per-trajectory generation costs.",
    374           "source": "opus"
    375         }
    376       },
    377       "experimental_rigor": {
    378         "seed_sensitivity_reported": {
    379           "applies": true,
    380           "answer": false,
    381           "justification": "No random seed sensitivity analysis. The model training does not report results across multiple training seeds. The 3 evaluation runs capture evaluation variance but not training variance.",
    382           "source": "opus"
    383         },
    384         "number_of_runs_stated": {
    385           "applies": true,
    386           "answer": true,
    387           "justification": "Section 5.1.1 states 'we run three independent evaluations for each online benchmark and report the average.' Table 19 confirms this with mean ± std across 3 runs.",
    388           "source": "opus"
    389         },
    390         "hyperparameter_search_budget": {
    391           "applies": true,
    392           "answer": false,
    393           "justification": "No hyperparameter search budget is reported. The paper mentions 'we tune the mixing ratios of the data' (Section 3.2) and 'based on early experiments, we set N=3' without stating how many configurations were tried.",
    394           "source": "opus"
    395         },
    396         "best_config_selection_justified": {
    397           "applies": true,
    398           "answer": false,
    399           "justification": "The final data mixture ratios, N=3 observation window, and other design choices appear tuned but the selection process is not documented. No validation set-based selection is described.",
    400           "source": "opus"
    401         },
    402         "multiple_comparison_correction": {
    403           "applies": true,
    404           "answer": false,
    405           "justification": "No statistical tests are performed at all, so no correction for multiple comparisons is applied. Comparisons across 4 benchmarks and 7+ models would warrant correction.",
    406           "source": "opus"
    407         },
    408         "self_comparison_bias_addressed": {
    409           "applies": true,
    410           "answer": false,
    411           "justification": "The authors (Microsoft) evaluate their own model against competitors. The SoM agents use their own implementation. No discussion of self-evaluation bias. The Browserbase human eval partially mitigates this but the gap is not explored.",
    412           "source": "opus"
    413         },
    414         "compute_budget_vs_performance": {
    415           "applies": true,
    416           "answer": true,
    417           "justification": "Figure 1 and Tables 10/12 explicitly compare accuracy vs. cost across models. The paper's central argument is about the Pareto frontier of cost vs. performance.",
    418           "source": "opus"
    419         },
    420         "benchmark_construct_validity": {
    421           "applies": true,
    422           "answer": true,
    423           "justification": "Section 4 (WebTailBench) extensively discusses limitations of existing benchmarks — lack of diversity, unrealistic tasks, poor alignment with human judgment. The paper creates WebTailBench specifically to address construct validity gaps.",
    424           "source": "opus"
    425         },
    426         "scaffold_confound_addressed": {
    427           "applies": true,
    428           "answer": true,
    429           "justification": "The paper explicitly separates SoM agents (using accessibility trees) from native CUA models (using only screenshots) in all result tables. Section 7 discusses why SoM vs native CUA is a confound and compares at matched paradigms.",
    430           "source": "opus"
    431         }
    432       },
    433       "data_leakage": {
    434         "temporal_leakage_addressed": {
    435           "applies": true,
    436           "answer": false,
    437           "justification": "No discussion of whether the base model Qwen2.5-VL was trained on data that includes benchmark solutions or similar web interaction patterns.",
    438           "source": "opus"
    439         },
    440         "feature_leakage_addressed": {
    441           "applies": true,
    442           "answer": false,
    443           "justification": "No discussion of whether the FaraGen training data includes websites or task types that overlap with evaluation benchmarks.",
    444           "source": "opus"
    445         },
    446         "non_independence_addressed": {
    447           "applies": true,
    448           "answer": false,
    449           "justification": "No analysis of whether FaraGen training domains overlap with benchmark domains. The training data visits 70K unique domains; no check against benchmark domains is reported.",
    450           "source": "opus"
    451         },
    452         "leakage_detection_method": {
    453           "applies": true,
    454           "answer": false,
    455           "justification": "No leakage detection or prevention method is applied. No decontamination pipeline, overlap analysis, or temporal splits are described.",
    456           "source": "opus"
    457         }
    458       }
    459     }
    460   },
    461   "claims": [
    462     {
    463       "claim": "Fara-7B achieves state-of-the-art performance among 7B-scale CUA models, outperforming UI-TARS-1.5-7B on WebVoyager (73.5% vs 66.4%) and WebTailBench (38.4% vs 19.5%)",
    464       "evidence": "Tables 9 and 19 report results across 3 independent runs with standard deviations; Fara-7B consistently leads UI-TARS-1.5-7B on all four benchmarks",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "Fara-7B is competitive with or outperforms larger proprietary models including OpenAI computer-use-preview and GPT-4o SoM agents on WebVoyager",
    469       "evidence": "Table 9 shows Fara-7B at 73.5% vs OpenAI computer-use-preview 70.9% and SoM GPT-4o 65.1%; however, GPT-5 SoM scores 90.6%, showing a clear gap to the frontier",
    470       "supported": "moderate"
    471     },
    472     {
    473       "claim": "FaraGen generates verified web trajectories for approximately $1 per task even using premium models",
    474       "evidence": "Table 6 shows $0.59 (o4-mini), $1.08 (o3), $1.00 (GPT-5) per trajectory based on a 600-trajectory sample averaging ~19 steps; sample may not represent full distribution",
    475       "supported": "moderate"
    476     },
    477     {
    478       "claim": "Fara-7B is 10x more cost-efficient than GPT-4o SoM agents, averaging $0.025/task vs ~$0.30",
    479       "evidence": "Table 10 confirms Fara-7B uses ~1.1k output tokens vs GPT-4o's ~1.8k and GPT-5's ~13k; pricing derived from official OpenAI and inferred third-party sources",
    480       "supported": "strong"
    481     },
    482     {
    483       "claim": "Fara-7B demonstrates positive data scaling trends and similar inference step-budget scaling to UI-TARS despite using only SFT vs RL",
    484       "evidence": "Figure 7 shows performance improving from 1% to 100% training data; step-budget scaling curves for Fara-7B and UI-TARS-1.5-7B are nearly identical on both WebVoyager and Online-Mind2Web",
    485       "supported": "strong"
    486     },
    487     {
    488       "claim": "FaraGen verifier pipeline achieves 83.3% agreement with human judges (16.7% false positive rate)",
    489       "evidence": "Stated in Section 2.3 without reporting sample size for this verification study or methodology details for measuring verifier-human agreement",
    490       "supported": "weak"
    491     },
    492     {
    493       "claim": "Fara-7B achieves strongest safety refusal rates among CUA models (94.2% on AgentHarm-Chat vs 3.8% for UI-TARS-1.5-7B)",
    494       "evidence": "Table 14 reports results; however Fara-7B was specifically trained on safety data while UI-TARS-1.5-7B was not, making comparison partially unfair",
    495       "supported": "moderate"
    496     }
    497   ],
    498   "methodology_tags": [
    499     "benchmark-eval",
    500     "case-study"
    501   ],
    502   "key_findings": "Fara-7B demonstrates that a 7B-parameter model trained on synthetically generated web trajectories (FaraGen) can achieve state-of-the-art performance among small CUA models and outperform larger proprietary systems on several benchmarks at 10x lower cost. FaraGen generates verified multi-step web trajectories for ~$1 each using a multi-agent pipeline (Orchestrator + WebSurfer + verifiers), producing 145K training trajectories spanning 70K domains. Fara-7B's 'pixel-in, action-out' formulation—operating on screenshots without accessibility trees—achieves 73.5% on WebVoyager at $0.025/task. Data scaling experiments show consistent performance gains from 18K to 1.8M action steps, suggesting further improvements are achievable with more data.",
    503   "red_flags": [
    504     {
    505       "flag": "Self-introduced benchmark advantage",
    506       "detail": "WebTailBench is introduced and evaluated by the same Microsoft team; Fara-7B shows disproportionately large gains on WebTailBench (38.4% vs UI-TARS's 19.5%, a 97% relative improvement) compared to other benchmarks, raising questions about alignment between training data distribution and evaluation tasks."
    507     },
    508     {
    509       "flag": "Training-test website contamination",
    510       "detail": "FaraGen training data is drawn from the same live websites used in evaluation benchmarks (WebVoyager, DeepShop, etc.); the paper does not address whether training trajectories overlap with or contaminate test scenarios."
    511     },
    512     {
    513       "flag": "LLM-judge vs human eval 11.5pp gap",
    514       "detail": "Human evaluation found 62% accuracy vs 73.5% by LLM-as-judge—an 11.5 percentage point systematic inflation—acknowledged but not resolved; all main results use the LLM judge."
    515     },
    516     {
    517       "flag": "No statistical significance testing",
    518       "detail": "Despite reporting means and standard deviations across 3 runs, no significance tests are performed; with only 3 runs and high variance (e.g., Online-M2W ±3.7 for Fara-7B), some performance gaps may not be statistically meaningful."
    519     },
    520     {
    521       "flag": "16.7% verifier false positive rate in training data",
    522       "detail": "Trajectory verifier has 16.7% false positive rate, meaning a substantial fraction of the 145K training demonstrations may be incorrect; the impact on model quality is not analyzed."
    523     },
    524     {
    525       "flag": "Third-party inference pricing assumptions",
    526       "detail": "Cost comparisons for Fara-7B/UI-TARS-1.5-7B rely on third-party inference pricing ($0.20/M tokens) rather than official pricing; GLM pricing is extrapolated via a 72% markup heuristic from a different provider."
    527     }
    528   ],
    529   "cited_papers": [
    530     {
    531       "title": "UI-TARS: Pioneering Automated GUI Interaction with Native Agents",
    532       "relevance": "Primary comparison model sharing the same Qwen2.5-VL-7B base; represents the main competitive baseline for Fara-7B at equivalent scale"
    533     },
    534     {
    535       "title": "Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks",
    536       "relevance": "Foundation for FaraGen's task-solving pipeline; Orchestrator-WebSurfer architecture extends Magentic-One"
    537     },
    538     {
    539       "title": "WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models",
    540       "relevance": "Primary evaluation benchmark establishing the web agent evaluation protocol used throughout"
    541     },
    542     {
    543       "title": "An Illusion of Progress? Assessing the Current State of Web Agents",
    544       "relevance": "Provides Online-Mind2Web benchmark and analysis of the auto-eval vs human-eval gap cited by the paper"
    545     },
    546     {
    547       "title": "AgentInstruct: Toward Generative Teaching with Agentic Flows",
    548       "relevance": "Prior work on synthetic agentic training data generation that FaraGen's targeted URL task proposal builds on"
    549     },
    550     {
    551       "title": "SeeClick: Harnessing GUI Grounding for Advanced Visual GUI Agents",
    552       "relevance": "Source of grounding annotation data used in Fara-7B's auxiliary training mix"
    553     },
    554     {
    555       "title": "OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments",
    556       "relevance": "CUA evaluation environment used to run UI-TARS-1.5-7B baseline; establishes desktop-scale CUA benchmarking"
    557     },
    558     {
    559       "title": "Qwen2.5-VL Technical Report",
    560       "relevance": "Base model for Fara-7B; understanding its capabilities is essential for interpreting what FaraGen training adds"
    561     }
    562   ],
    563   "engagement_factors": {
    564     "practical_relevance": {
    565       "score": 3,
    566       "justification": "Model released on HuggingFace and Azure Foundry with inference harness; directly addresses on-device deployment of web agents at 10x lower cost than GPT-4o."
    567     },
    568     "surprise_contrarian": {
    569       "score": 2,
    570       "justification": "Challenges the assumption that frontier-size models are required for competitive CUA; 7B model matching GPT-4o at $0.025/task vs $0.30 is counterintuitive."
    571     },
    572     "fear_safety": {
    573       "score": 1,
    574       "justification": "Safety evaluation covers harmful task refusals and critical points, but the paper's primary framing is capability-positive rather than cautionary about CUA risks."
    575     },
    576     "drama_conflict": {
    577       "score": 1,
    578       "justification": "Implicit competition with OpenAI computer-use-preview and ByteDance's UI-TARS, but framing is collegial and technical rather than adversarial."
    579     },
    580     "demo_ability": {
    581       "score": 3,
    582       "justification": "Model immediately available on HuggingFace and Azure AI Foundry; GitHub inference harness allows direct testing of web agent capabilities."
    583     },
    584     "brand_recognition": {
    585       "score": 3,
    586       "justification": "Microsoft Research paper with direct comparisons against OpenAI GPT-5, o3, and computer-use-preview; high brand recognition on all sides of the comparison."
    587     }
    588   },
    589   "hn_data": {
    590     "threads": [
    591       {
    592         "hn_id": "46650465",
    593         "title": "Show HN: Agint Flow – design software as a graph, then compile the graph to code",
    594         "points": 5,
    595         "comments": 3,
    596         "url": "https://news.ycombinator.com/item?id=46650465",
    597         "created_at": "2026-01-16T18:56:09Z"
    598       },
    599       {
    600         "hn_id": "46380330",
    601         "title": "Breakthrough Listen Observations of 3I/Atlas with the Green Bank Telescope",
    602         "points": 3,
    603         "comments": 3,
    604         "url": "https://news.ycombinator.com/item?id=46380330",
    605         "created_at": "2025-12-24T23:14:21Z"
    606       }
    607     ],
    608     "top_points": 5,
    609     "total_points": 8,
    610     "total_comments": 6
    611   }
    612 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs