scan.json (24952B)
1 { 2 "paper": { 3 "title": "How Far Are We from Genuinely Useful Deep Research Agents?", 4 "authors": [ 5 "Dingling Zhang", 6 "He Zhu", 7 "Jincheng Ren", 8 "Kangqi Song", 9 "Xinran Zhou", 10 "Boyu Feng", 11 "Shudong Liu", 12 "Jiabin Luo", 13 "Weihao Xie", 14 "Zhaohui Wang", 15 "Tianrui Qin", 16 "King Zhu", 17 "Yuqing Wang", 18 "Qianben Chen", 19 "Yuchen Eleanor Jiang", 20 "Wei Wang", 21 "Wangchunshu Zhou", 22 "Jiaheng Liu" 23 ], 24 "year": 2025, 25 "venue": "arXiv", 26 "arxiv_id": "2512.01948", 27 "doi": "10.48550/arXiv.2512.01948" 28 }, 29 "scan_version": 2, 30 "active_modules": ["experimental_rigor"], 31 "checklist": { 32 "artifacts": { 33 "code_released": { 34 "applies": true, 35 "answer": true, 36 "justification": "GitHub link provided: https://github.com/OPPO-PersonalAI/FINDER_DEFT, listed in the abstract area as 'Code & Data'." 37 }, 38 "data_released": { 39 "applies": true, 40 "answer": true, 41 "justification": "The same GitHub repository is listed for both code and data. The benchmark (FINDER) with 100 tasks and 419 checklists is indicated as released." 42 }, 43 "environment_specified": { 44 "applies": true, 45 "answer": false, 46 "justification": "No environment specifications, requirements.txt, or dependency details are provided in the paper." 47 }, 48 "reproduction_instructions": { 49 "applies": true, 50 "answer": false, 51 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no details on how to reproduce experiments." 52 } 53 }, 54 "statistical_methodology": { 55 "confidence_intervals_or_error_bars": { 56 "applies": true, 57 "answer": true, 58 "justification": "Table I.3 reports standard deviations for MiroFlow stability analysis across three runs. However, the main results in Table 1 do not include error bars or CIs." 
59 }, 60 "significance_tests": { 61 "applies": true, 62 "answer": false, 63 "justification": "No statistical significance tests are used to support claims of performance differences between systems. Rankings are based on raw score comparisons." 64 }, 65 "effect_sizes_reported": { 66 "applies": true, 67 "answer": true, 68 "justification": "Performance differences are reported with baseline context, e.g., Gemini at 50.95 vs. Perplexity at 41.62 on RACE overall, and percentage breakdowns for failure categories (e.g., 'over 39% of failures arise in content generation')." 69 }, 70 "sample_size_justified": { 71 "applies": true, 72 "answer": false, 73 "justification": "No justification for why 100 tasks were chosen, or why ~1000 reports were analyzed. No power analysis or discussion of sample adequacy." 74 }, 75 "variance_reported": { 76 "applies": true, 77 "answer": false, 78 "justification": "Main results (Table 1) are single-run numbers for most systems. Only MiroFlow has multi-run variance reported (Appendix I, 3 runs). Other models show no variance information." 79 } 80 }, 81 "evaluation_design": { 82 "baselines_included": { 83 "applies": true, 84 "answer": true, 85 "justification": "The paper compares 13 systems across proprietary APIs, open-source models, and agent frameworks. It also compares FINDER against the original DeepResearch Bench (DRB)." 86 }, 87 "baselines_contemporary": { 88 "applies": true, 89 "answer": true, 90 "justification": "Baselines include Gemini 2.5 Pro, O3, O4-Mini, Kimi K2, and other 2025-era systems. These are contemporary and competitive." 91 }, 92 "ablation_study": { 93 "applies": true, 94 "answer": true, 95 "justification": "The DRB vs FINDER comparison (Section 4.3, Figure 4) serves as an ablation showing the effect of refined prompts and checklists on evaluation outcomes." 
96 }, 97 "multiple_metrics": { 98 "applies": true, 99 "answer": true, 100 "justification": "Multiple evaluation frameworks used: RACE (4 sub-dimensions), FACT (2 metrics), Positive Taxonomy Metrics (3 dimensions), and Checklist Pass Rate." 101 }, 102 "human_evaluation": { 103 "applies": true, 104 "answer": true, 105 "justification": "Four domain experts evaluated report outputs from WebThinker and OpenManus. Inter-coder reliability (ICR) was computed between human and LLM annotations (Table 3). Seven experts curated the benchmark tasks." 106 }, 107 "held_out_test_set": { 108 "applies": true, 109 "answer": true, 110 "justification": "For DEFT validation (Section 3.2.3), 36 execution records from WebThinker and OpenManus (not involved in taxonomy construction) were used as a held-out set to test theoretical saturation." 111 }, 112 "per_category_breakdown": { 113 "applies": true, 114 "answer": true, 115 "justification": "Table 1 provides per-dimension breakdowns (RACE sub-scores, FACT sub-scores, per-category taxonomy scores). Figure 3 shows per-category failure distributions." 116 }, 117 "failure_cases_discussed": { 118 "applies": true, 119 "answer": true, 120 "justification": "Appendix C provides extensive case studies for each of the 14 failure categories, with specific task IDs, model sources, and detailed analysis of what went wrong." 121 }, 122 "negative_results_reported": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper documents that current DRAs struggle with evidence integration and verification. Specific system weaknesses are reported, e.g., Kimi K2 and O4-Mini suffering 'sharp decline in generation scores' despite strong reasoning." 126 } 127 }, 128 "claims_and_evidence": { 129 "abstract_claims_supported": { 130 "applies": true, 131 "answer": true, 132 "justification": "Abstract claims about 39% generation failures and 32% retrieval failures are supported by Figure 3 and the taxonomy analysis. 
Claims about evidence integration struggles are supported by Table 1 results." 133 }, 134 "causal_claims_justified": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper makes causal claims like 'reasoning resilience... is the key factor determining whether an agent can consistently produce high-quality deep research outcomes' (Insight 1) without adequate causal identification. The correlation matrix (Figure 6) shows associations but is presented as evidence for causal failure pathways." 138 }, 139 "generalization_bounded": { 140 "applies": true, 141 "answer": false, 142 "justification": "The title asks 'How Far Are We from Genuinely Useful Deep Research Agents?' — a broad claim. Results are based on 100 tasks from DeepResearch Bench, which covers specific domains. The paper does not bound generalizations to these specific task types." 143 }, 144 "alternative_explanations_discussed": { 145 "applies": true, 146 "answer": false, 147 "justification": "No discussion of alternative explanations for observed failure patterns. For example, differences could stem from prompt sensitivity, API version changes, or evaluation framework biases rather than fundamental agent limitations." 148 }, 149 "proxy_outcome_distinction": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper measures checklist pass rates, RACE scores, and taxonomy metrics as proxies for 'genuine usefulness' of deep research agents but does not discuss the gap between these automated/semi-automated measures and actual real-world utility." 153 } 154 }, 155 "setup_transparency": { 156 "model_versions_specified": { 157 "applies": true, 158 "answer": false, 159 "justification": "Some models use marketing names without exact versions: 'Gemini-2.5-Pro Deep Research', 'O3 Deep Research', 'O4-Mini Deep Research'. Open-source models specify some versions (e.g., 'WebThinker-QwQ-32B', 'AFM-45B') but proprietary APIs lack snapshot dates." 
160 }, 161 "prompts_provided": { 162 "applies": true, 163 "answer": true, 164 "justification": "Appendix A.2 provides example queries with full checklist items. Appendix D shows a failure analysis report example. Appendix G provides seed conceptual categories used for coding prompts." 165 }, 166 "hyperparameters_reported": { 167 "applies": true, 168 "answer": true, 169 "justification": "Appendix K reports configurations: AFM (temperature=0.4, top_p=0.9, max_tokens=32K), MiroFlow (temperature=0.3, top_p=0.95, max_tokens=32K), OpenManus (temperature=0.0, max_tokens=8192). Proprietary APIs use 'default configurations'." 170 }, 171 "scaffolding_described": { 172 "applies": true, 173 "answer": false, 174 "justification": "The paper evaluates agent frameworks (OWL, OpenManus, MiroFlow) but provides minimal description of their internal scaffolding. It says MiroFlow is a 'Dual-agent framework' and OWL is a 'Multi-agent architecture' without detailing the scaffolding that would explain performance differences." 175 }, 176 "data_preprocessing_documented": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 3.1 documents the benchmark construction pipeline: prompt refinement by 7 domain experts, checklist construction with iterative Gemini 2.5 Flash refinement, and independent quality evaluation. Section 3.2 details the coding pipeline with explicit stages." 180 } 181 }, 182 "limitations_and_scope": { 183 "limitations_section_present": { 184 "applies": true, 185 "answer": false, 186 "justification": "No dedicated limitations section is present in the paper. The conclusion is brief and does not discuss limitations." 187 }, 188 "threats_to_validity_specific": { 189 "applies": true, 190 "answer": false, 191 "justification": "No threats to validity are discussed. Issues like LLM judge reliability, benchmark representativeness, or evaluation framework biases are not addressed." 
192 }, 193 "scope_boundaries_stated": { 194 "applies": true, 195 "answer": false, 196 "justification": "No explicit scope boundaries are stated. The paper does not discuss what settings, task types, or agent architectures are excluded from its claims." 197 } 198 }, 199 "data_integrity": { 200 "raw_data_available": { 201 "applies": true, 202 "answer": true, 203 "justification": "The GitHub repository is indicated to contain the benchmark data and code. The 100 tasks and 419 checklists would enable verification." 204 }, 205 "data_collection_described": { 206 "applies": true, 207 "answer": true, 208 "justification": "Section 3.1 describes benchmark construction from DeepResearch Bench with expert refinement. Section 3.2 describes the grounded-theory-based taxonomy construction with detailed stages." 209 }, 210 "recruitment_methods_described": { 211 "applies": true, 212 "answer": false, 213 "justification": "Seven domain experts and four annotators are mentioned but their recruitment, expertise domains, and potential selection biases are not described. The paper describes them only as 'domain experts'." 214 }, 215 "data_pipeline_documented": { 216 "applies": true, 217 "answer": true, 218 "justification": "Table F.1 documents the full coding pipeline with generation counts, refinement rounds, and group partitioning. Algorithms 1-4 in Appendix F formalize each stage." 219 } 220 }, 221 "conflicts_of_interest": { 222 "funding_disclosed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No funding source is disclosed. The paper is by 'OPPO AI Agent Team' but no funding acknowledgments section exists." 226 }, 227 "affiliations_disclosed": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper is identified as from 'OPPO AI Agent Team'. Correspondence addresses include @oppo.com and @nju.edu.cn." 
231 }, 232 "funder_independent_of_outcome": { 233 "applies": true, 234 "answer": false, 235 "justification": "OPPO is a technology company with potential commercial interest in deep research agent capabilities. They evaluate their own MiroFlow and MiroThinker systems alongside competitors, creating a non-independent evaluation. No disclosure of this conflict." 236 }, 237 "financial_interests_declared": { 238 "applies": true, 239 "answer": false, 240 "justification": "No competing interests statement is present. OPPO employees evaluating OPPO's MiroFlow/MiroThinker systems against competitors is an undisclosed financial interest." 241 } 242 }, 243 "contamination": { 244 "training_cutoff_stated": { 245 "applies": false, 246 "answer": false, 247 "justification": "This paper evaluates agents on report generation tasks, not pre-trained model knowledge on benchmarks. The benchmark tests synthesis/retrieval capabilities, not memorized answers." 248 }, 249 "train_test_overlap_discussed": { 250 "applies": false, 251 "answer": false, 252 "justification": "Report generation tasks with web retrieval are not susceptible to traditional train/test contamination." 253 }, 254 "benchmark_contamination_addressed": { 255 "applies": false, 256 "answer": false, 257 "justification": "The benchmark tests open-ended report generation with live web retrieval, not closed-form answers that could be memorized." 258 } 259 }, 260 "human_studies": { 261 "pre_registered": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in the traditional sense. Expert annotators are used for validation, not as study subjects." 265 }, 266 "irb_or_ethics_approval": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human subjects study. Expert annotators are collaborators, not participants." 270 }, 271 "demographics_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants study." 
275 }, 276 "inclusion_exclusion_criteria": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants study." 280 }, 281 "randomization_described": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants study." 285 }, 286 "blinding_described": { 287 "applies": false, 288 "answer": false, 289 "justification": "No human participants study." 290 }, 291 "attrition_reported": { 292 "applies": false, 293 "answer": false, 294 "justification": "No human participants study." 295 } 296 }, 297 "cost_and_practicality": { 298 "inference_cost_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "No inference costs, API costs, or latency are reported for any of the evaluated systems despite testing ~1000 report generations across 13 systems." 302 }, 303 "compute_budget_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "No total computational budget is stated for running the ~1000 report generations or the annotation/coding process." 307 } 308 }, 309 "experimental_rigor": { 310 "seed_sensitivity_reported": { 311 "applies": true, 312 "answer": false, 313 "justification": "Only MiroFlow was run 3 times (Appendix I). All other 12 systems appear to be single-run evaluations with no seed sensitivity analysis." 314 }, 315 "number_of_runs_stated": { 316 "applies": true, 317 "answer": false, 318 "justification": "Number of runs is stated only for MiroFlow (3 runs, Appendix I). For other systems, it is unclear whether results are single-run or averaged." 319 }, 320 "hyperparameter_search_budget": { 321 "applies": true, 322 "answer": false, 323 "justification": "No hyperparameter search budget is reported. It is unclear whether the reported configurations were tuned or are defaults." 324 }, 325 "best_config_selection_justified": { 326 "applies": true, 327 "answer": false, 328 "justification": "No justification for why specific configurations were chosen. 
Appendix K states proprietary APIs used 'default configurations' but does not justify open-source model settings." 329 }, 330 "multiple_comparison_correction": { 331 "applies": true, 332 "answer": false, 333 "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons across 13 systems and multiple metrics." 334 }, 335 "self_comparison_bias_addressed": { 336 "applies": true, 337 "answer": false, 338 "justification": "OPPO evaluates its own MiroFlow and MiroThinker systems alongside competitors. No acknowledgment of self-comparison bias. MiroFlow achieves the highest checklist pass rate (72.19%), which is not discussed as a potential bias." 339 }, 340 "compute_budget_vs_performance": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of compute budget differences between systems. Proprietary deep research APIs likely use vastly more compute than open-source models, but this is not analyzed." 344 }, 345 "benchmark_construct_validity": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether FINDER's checklist-based evaluation actually measures 'genuine usefulness' of deep research agents as claimed in the title. The construct validity gap between checklist pass rates and real-world utility is not addressed." 349 }, 350 "scaffold_confound_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "Systems are compared with different scaffolding (proprietary APIs with unknown scaffolding vs. open-source models vs. agent frameworks), but the scaffold confound is not addressed. Performance differences are attributed to the systems rather than isolating model vs. scaffold effects." 354 } 355 } 356 }, 357 "claims": [ 358 { 359 "claim": "Over 39% of DRA failures arise in content generation, particularly through strategic content fabrication.", 360 "evidence": "Figure 3 shows generation category at 38.76% (about 39% after rounding, marginally below the literal 'over 39%' wording). 
SCF (Strategic Content Fabrication) is the largest single failure mode at 19.0%.", 361 "supported": "strong" 362 }, 363 { 364 "claim": "Retrieval-related failures account for over 32% of errors.", 365 "evidence": "Figure 3 shows retrieval category at 33.10%, with IIA (Insufficient External Information Acquisition) at 16.3% being the largest.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Current DRAs struggle not with task comprehension but with evidence integration, verification, and reasoning-resilient planning.", 370 "evidence": "Reasoning failures account for only 28.14% vs. retrieval 33.10% and generation 38.76%. FUR (requirements understanding) is only 10.55%.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "DEFT achieves strong inter-coder reliability between human and LLM annotations.", 375 "evidence": "Table 3 reports Krippendorff's alpha coefficients: OpenManus avg 0.8203, WebThinker avg 0.8526, both above the 0.80 threshold for strong reliability.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "FINDER enforces stronger constraints on reasoning transparency and source reliability than DRB.", 380 "evidence": "Figure 4 shows FACT scores generally decline under FINDER compared to DRB, except O3 citation effectiveness. RACE scores remain similar.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "The taxonomy achieved theoretical saturation.", 385 "evidence": "Section 3.2.3: 36 held-out records from WebThinker and OpenManus were coded with no new categories emerging.", 386 "supported": "moderate" 387 } 388 ], 389 "methodology_tags": ["benchmark-eval", "qualitative"], 390 "key_findings": "The paper introduces FINDER, a 100-task benchmark with 419 structured checklists for evaluating deep research agents, and DEFT, a 14-category failure taxonomy built via grounded theory. Evaluation of 13 DRA systems shows generation failures dominate (39%), with strategic content fabrication being the most common single failure mode (19%). 
Gemini 2.5 Pro leads on RACE quality metrics while MiroFlow achieves the highest checklist pass rate. The taxonomy reveals that DRAs struggle less with task comprehension and more with evidence integration and verification.", 391 "red_flags": [ 392 { 393 "flag": "Conflict of interest: company evaluating its own product", 394 "detail": "OPPO AI Agent Team evaluates its own MiroFlow and MiroThinker systems alongside competitors. MiroFlow-English achieves the highest checklist pass rate (72.19%). No conflict of interest is disclosed." 395 }, 396 { 397 "flag": "Single-run evaluations for most systems", 398 "detail": "Only MiroFlow was evaluated across multiple runs (3). All other 12 systems appear to be single-run, making results subject to random variation in generation." 399 }, 400 { 401 "flag": "No significance testing", 402 "detail": "Performance rankings are based on raw score comparisons without any statistical tests. Small differences between systems (e.g., Gemini 50.95 vs Kimi 48.28) may not be meaningful." 403 }, 404 { 405 "flag": "No limitations section", 406 "detail": "The paper lacks any discussion of limitations, threats to validity, or scope boundaries despite making broad claims about the state of deep research agents." 407 }, 408 { 409 "flag": "LLM-as-judge circularity", 410 "detail": "RACE uses Gemini 2.5 Pro as the judge LLM, and Gemini 2.5 Pro Deep Research is also one of the evaluated systems. FACT uses Gemini 2.5 Flash. The potential circularity of using Gemini to judge Gemini is not discussed." 411 } 412 ], 413 "cited_papers": [ 414 { 415 "title": "GAIA: a benchmark for general AI assistants", 416 "authors": ["Grégoire Mialon", "Clémentine Fourrier", "Thomas Wolf", "Yann LeCun", "Thomas Scialom"], 417 "year": 2023, 418 "relevance": "Major benchmark for evaluating general AI agent capabilities, relevant to agent evaluation methodology." 
419 }, 420 { 421 "title": "Humanity's Last Exam", 422 "authors": ["Long Phan"], 423 "year": 2025, 424 "arxiv_id": "2501.14249", 425 "relevance": "Expert-level evaluation benchmark for AI, relevant to understanding benchmark difficulty and evaluation design." 426 }, 427 { 428 "title": "WebThinker: Empowering Large Reasoning Models with Deep Research Capability", 429 "authors": ["Xiaoxi Li"], 430 "year": 2025, 431 "arxiv_id": "2504.21776", 432 "relevance": "Open-source deep research agent evaluated in this study, relevant to agentic AI systems." 433 }, 434 { 435 "title": "Why do multi-agent LLM systems fail?", 436 "authors": ["Mert Cemri"], 437 "year": 2025, 438 "arxiv_id": "2503.13657", 439 "relevance": "Failure taxonomy for multi-agent LLM systems, directly comparable to DEFT taxonomy." 440 }, 441 { 442 "title": "OWL: Optimized Workforce Learning for General Multi-Agent Assistance in Real-World Task Automation", 443 "authors": ["Mengkang Hu"], 444 "year": 2025, 445 "arxiv_id": "2505.23885", 446 "relevance": "Multi-agent framework evaluated in this study, relevant to agentic AI architecture." 447 }, 448 { 449 "title": "OpenManus: An Open-Source Framework for Building General AI Agents", 450 "authors": ["Xinbin Liang"], 451 "year": 2025, 452 "relevance": "Open-source agent framework evaluated in this study." 453 }, 454 { 455 "title": "Mind2Web 2: Evaluating Agentic Search with Agent-as-a-Judge", 456 "authors": ["Boyu Gou"], 457 "year": 2025, 458 "arxiv_id": "2506.21506", 459 "relevance": "Benchmark for agentic search with agent-as-judge evaluation framework." 460 }, 461 { 462 "title": "DeepResearchGym: A Free, Transparent, and Reproducible Evaluation Sandbox for Deep Research", 463 "authors": ["João Coelho"], 464 "year": 2025, 465 "arxiv_id": "2505.19253", 466 "relevance": "Sandbox environment for evaluating deep research agents with reproducible protocols." 
467 }, 468 { 469 "title": "Kimi K2: Open Agentic Intelligence", 470 "authors": ["Kimi Team"], 471 "year": 2025, 472 "arxiv_id": "2507.20534", 473 "relevance": "Major open agentic model evaluated in this study." 474 }, 475 { 476 "title": "ChatDev: Communicative Agents for Software Development", 477 "authors": ["Wangchunshu Zhou"], 478 "year": 2023, 479 "relevance": "Multi-agent framework for software development, relevant to agentic AI coordination." 480 }, 481 { 482 "title": "BrowseComp: A Simple Yet Challenging Benchmark for Browsing Agents", 483 "authors": ["Jason Wei"], 484 "year": 2025, 485 "arxiv_id": "2504.12516", 486 "relevance": "Benchmark for browsing agents, relevant to web-based agent evaluation." 487 } 488 ] 489 }