ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (23197B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "How Far Are We from Genuinely Useful Deep Research Agents?",
      6     "authors": [
      7       "Dingling Zhang",
      8       "He Zhu",
      9       "Jincheng Ren",
     10       "Kangqi Song",
     11       "Xinran Zhou"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2512.01948",
     16     "doi": "10.48550/arXiv.2512.01948"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All major abstract claims are backed by paper content: 100 tasks with 419 checklist items (Section 3.1), DEFT's 14 failure modes (Table 2), and the 39%/32% failure distribution figures are confirmed by Figure 3 and Table 2 experimental data.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper makes causal-sounding claims (structured checklists 'enhance evaluation precision,' DRAs 'struggle with' specific capabilities) but the study design is observational benchmark evaluation without ablations or controlled experiments adequate for causal inference.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The conclusion broadly states 'current agents' struggle with evidence integration based on 13 evaluated systems and 100 tasks from a single source benchmark (DRB), without bounding these generalizations to the specific task types, domains, or conditions tested.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not consider alternative explanations for observed failure patterns—for example, whether RACE score consistency between DRB and FINDER reflects task difficulty confounds, evaluator bias from using Gemini as both reference-generator and judge, or benchmark-specific artifacts.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly distinguishes RACE (relative quality against AI reference), FACT (citation accuracy), and checklist pass rate as measuring different aspects; Section 4.3 explicitly notes RACE measures relative not absolute performance, showing awareness of proxy measurement limitations.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section. Methodological constraints are mentioned inline (RACE relativity in Section 4.3, FACT failures in Appendix M) but not consolidated into a formal limitations discussion.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No specific threats to validity are identified. The paper does not address whether expert checklist creation introduces systematic bias, whether Gemini-generated reference reports inflate Gemini's RACE scores, or whether the 100-task sample is representative of real deep research needs.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what its results do NOT show. There is no discussion of task types excluded from FINDER, DRA capabilities the benchmark cannot assess, or limits on how checklist scores translate to real-world research quality.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment appears anywhere in the paper. The work is attributed to OPPO AI Agent Team but no institutional funding, grants, or corporate research budget disclosure is provided.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper is explicitly attributed to 'OPPO AI Agent Team' with correspondence to oppo.com and nju.edu.cn addresses, making the primary organizational affiliations identifiable.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "OPPO/MiroMind-affiliated systems (MiroFlow, MiroThinker) are among the evaluated DRAs while OPPO employees constructed the benchmark, creating a potential conflict where benchmark designers may have inadvertently favored evaluation criteria aligned with their own systems' strengths.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or financial disclosure is provided anywhere in the paper, despite the commercial context (OPPO is a major consumer electronics company) and the commercial DRA market being evaluated.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are defined: 'Deep Research Agents' in the introduction, RACE and FACT frameworks in Appendix E, 'grounded theory' with methodology citations, and all 14 DEFT failure categories are formally defined in Appendix B with detailed case studies.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit contributions are enumerated at the end of the introduction: FINDER benchmark (100 tasks, 419 checklist items), DEFT failure taxonomy (14 modes across 3 dimensions), and experimental findings demonstrating current DRA weaknesses.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 explicitly compares FINDER against GAIA, HLE, Mind2Web 2, DeepResearchGym, DeepScholar-Bench, and DRBench, explaining how FINDER differs from each via structured checklists versus subjective metrics and expert-curated tasks versus LLM-generated tasks.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "The paper asserts that expert-curated structured checklists improve evaluation of deep research quality but does not empirically validate that checklist scores correlate with independent human expert judgments of report quality; construct validity rests on expert authority alone.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Figure J.3 shows only the distribution of checklist items per query (3, 4, or 5), not the difficulty distribution of tasks. No easy/medium/hard tiers are defined, and model score variance is not analyzed as a proxy for task difficulty.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No ceiling or floor effect analysis is performed. Checklist pass rates range from 44.87% to 72.19%, RACE scores from ~30 to ~51—the range suggests no obvious ceiling, but this is not explicitly verified or discussed.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No human baseline is provided. Reference reports used in RACE evaluation are generated by Gemini-2.5-Pro Deep Research, not human researchers, so performance is benchmarked against an AI standard rather than human analyst capability.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "RACE/FACT metrics are inherited from DeepResearch Bench without independent justification. The novel cosine-based positive taxonomy metric (Appendix L) is described mathematically but its choice over simpler linear alternatives is not adequately motivated beyond aesthetic preference for curvature.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No contamination resistance measures are mentioned—no temporal splits, canary strings, or dynamic task generation. The 100 tasks derive from DeepResearch Bench PhD-level topics that may appear in model training data.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The paper does not discuss whether FINDER will remain discriminative as DRA capabilities improve, whether benchmark tasks will be gamed, or any update plan for the task set or scoring criteria as the field advances.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "DEFT documents failure modes of DRAs, not of the benchmark itself. Appendix M addresses FACT technical failures (anti-scraping, fabricated URLs) but the paper does not systematically analyze what capabilities FINDER cannot measure or how agents could game it.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Code and data are released at github.com/OPPO-PersonalAI/FINDER_DEFT, and Appendix K provides detailed model configurations for all 13 evaluated systems, enabling others to reproduce reported numbers.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "The paper describes construction methodology and provides query examples (Appendix A.2), but lacks a formal data card, does not detail preprocessing steps or quality control for LLM-based checklist refinement, and omits information about expert selection criteria.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The paper links to a GitHub repository but does not specify the license under which FINDER tasks and DEFT annotations are released, leaving downstream use terms and redistribution rights unclear.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "While the intended use (evaluating DRAs) is implicit, the paper does not specify what conclusions should NOT be drawn from benchmark results—such as whether FINDER scores generalize to real-world research utility or how domain imbalances affect score interpretation.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "Over 39% of failures in deep research agents arise in content generation, particularly through strategic content fabrication (19%) where agents generate plausible but unsupported professional-looking content",
    203       "evidence": "Figure 3 and Table 2 report Generation failures at 38.76% (roughly 39% after rounding, i.e. marginally below the claim's 'over 39%' wording) with SCF as the single largest failure mode at 19.0%, derived from analysis of ~1,000 reports across 9 evaluated systems",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "Current DRAs struggle not with task comprehension but with evidence integration, verification, and reasoning-resilient planning",
    208       "evidence": "Reasoning failures are lowest at 28.14% while Retrieval (33.10%) and Generation (38.76%) are higher; the paper explicitly frames this as 'reasoning resilience' vs. 'reasoning intensity' (Insight 1)",
    209       "supported": "moderate"
    210     },
    211     {
    212       "claim": "FINDER imposes stricter evaluation standards than DeepResearch Bench (DRB), exposing model weaknesses in factual consistency that DRB's original configuration obscures",
    213       "evidence": "Section 4.3 shows FACT scores diverge significantly between DRB and FINDER, with most systems experiencing declines in citation accuracy and effective citations under FINDER's harder prompts",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "Gemini-2.5-Pro Deep Research is the top-performing system with RACE overall score 50.95 and balanced performance across all taxonomy dimensions (Savg 72.89)",
    218       "evidence": "Table 1 shows Gemini leading on RACE Overall, and achieving the highest Savg among all evaluated proprietary API systems",
    219       "supported": "strong"
    220     },
    221     {
    222       "claim": "DEFT achieves strong inter-coder reliability with Krippendorff's alpha >0.8 between human expert and LLM coder pairs",
    223       "evidence": "Table 3 reports overall alpha of 0.8203 (OpenManus) and 0.8526 (WebThinker), with category-level coefficients ranging from 0.74 to 0.92",
    224       "supported": "strong"
    225     },
    226     {
    227       "claim": "Retrieval failures account for 32%+ of errors with insufficient external information acquisition (IIA) as the largest retrieval failure mode at 16.3%",
    228       "evidence": "Figure 3 and Table 2 report Retrieval failures at 33.10% overall, with IIA at 16.30% as the largest retrieval failure mode and the second-largest single failure mode across all three dimensions",
    229       "supported": "strong"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "benchmark-eval",
    234     "qualitative"
    235   ],
    236   "key_findings": "FINDER introduces 100 expert-curated deep research tasks with 419 structured checklist items that impose stricter evaluation than existing QA-based benchmarks, particularly for citation fidelity and analytical depth. DEFT identifies 14 fine-grained failure modes across reasoning, retrieval, and generation dimensions, finding that strategic content fabrication (19.0%) and insufficient information acquisition (16.3%) are the most prevalent failure types. Contrary to the assumption that task comprehension is the main bottleneck, DRAs demonstrate adequate understanding but fail catastrophically at grounding their output in evidence and at avoiding fabrication. Proprietary systems substantially outperform open-source alternatives on most dimensions, with Gemini-2.5-Pro leading overall and O3 Deep Research excelling on factual citation metrics. The benchmark reveals that superior DRA performance requires maintaining balance across understanding, evidence collection, and synthesis rather than overoptimizing any single stage.",
    237   "red_flags": [
    238     {
    239       "flag": "Benchmark-evaluator conflict of interest",
    240       "detail": "OPPO/MiroMind-affiliated systems (MiroFlow, MiroThinker) are evaluated using a benchmark created by the same team, and no conflict-of-interest statement is provided. Benchmark criteria may inadvertently favor evaluation dimensions aligned with the creators' systems."
    241     },
    242     {
    243       "flag": "LLM-as-gold-standard",
    244       "detail": "RACE evaluation uses Gemini-2.5-Pro-generated reports as the reference standard, inherently biasing relative scores toward Gemini-similar outputs. The benchmark measures quality against an AI gold standard rather than human expert work."
    245     },
    246     {
    247       "flag": "No human baseline",
    248       "detail": "The benchmark claims to evaluate 'analyst-level' report generation but provides no human expert performance baseline, making it impossible to calibrate how far any system is from actual human capability or to validate the benchmark's discriminative ceiling."
    249     },
    250     {
    251       "flag": "Construct validity unvalidated",
    252       "detail": "Checklist item validity rests solely on expert authority. No external validation study shows that checklist scores correlate with independent human judgments of report quality or downstream research utility."
    253     },
    254     {
    255       "flag": "No competing interests disclosure",
    256       "detail": "No funding source or competing interests statement is provided despite OPPO's commercial context, the presence of affiliated systems among those evaluated, and potential commercial interests in the DRA market outcomes."
    257     },
    258     {
    259       "flag": "Arbitrary scoring metric",
    260       "detail": "The novel cosine-based positive taxonomy metric is justified post hoc by analogy to cosine similarity; the choice over simpler alternatives (e.g., linear inverse error rate) is not driven by measurement validity arguments."
    261     }
    262   ],
    263   "cited_papers": [
    264     {
    265       "title": "GAIA: a benchmark for general AI assistants",
    266       "relevance": "Foundational DRA evaluation benchmark that FINDER positions against; represents the closed-ended QA paradigm the authors critique"
    267     },
    268     {
    269       "title": "DeepResearch Bench: A comprehensive benchmark for deep research agents",
    270       "relevance": "Direct predecessor that FINDER extends with structured checklists and refined prompts; FINDER's 100 tasks derive from DRB's task set"
    271     },
    272     {
    273       "title": "Mind2Web 2: Evaluating agentic search with agent-as-a-judge",
    274       "relevance": "Contemporary open-ended DRA benchmark using Agent-as-Judge framework; key comparison point for FINDER's evaluation approach"
    275     },
    276     {
    277       "title": "DeepResearchGym: A free, transparent, and reproducible evaluation sandbox for deep research",
    278       "relevance": "Contemporary DRA benchmark emphasizing reproducibility with standardized search APIs; positioned as complementary to FINDER"
    279     },
    280     {
    281       "title": "DRBench: A realistic benchmark for enterprise deep research",
    282       "relevance": "Enterprise-focused DRA benchmark using citation-grounded assessment; shares FINDER's focus on long-form analytical reports"
    283     },
    284     {
    285       "title": "Why do multi-agent LLM systems fail?",
    286       "relevance": "Prior failure taxonomy work (Cemri et al.) whose seed concepts and findings directly inform DEFT's open coding process"
    287     },
    288     {
    289       "title": "WebThinker: Empowering large reasoning models with deep research capability",
    290       "relevance": "Open-source DRA system evaluated in FINDER experiments; also used as one of two validation systems for DEFT's theoretical saturation check"
    291     },
    292     {
    293       "title": "Deep research agents: A systematic examination and roadmap",
    294       "relevance": "Survey of DRA systems and capabilities that provides context for the benchmark's scope and the field's current state"
    295     }
    296   ],
    297   "engagement_factors": {
    298     "practical_relevance": {
    299       "score": 2,
    300       "justification": "FINDER provides DRA developers a structured evaluation tool with released code and data, directly applicable to benchmarking new systems against a multi-dimensional quality rubric."
    301     },
    302     "surprise_contrarian": {
    303       "score": 2,
    304       "justification": "Challenges the assumption that DRAs fail primarily at task understanding; demonstrates fabrication (not retrieval or comprehension) is the #1 failure mode, reframing the field's optimization target."
    305     },
    306     "fear_safety": {
    307       "score": 2,
    308       "justification": "Strategic Content Fabrication at 19% of all failures—AI confidently generating plausible but unfounded academic content—raises concrete concerns about reliability of AI-generated research in high-stakes contexts."
    309     },
    310     "drama_conflict": {
    311       "score": 1,
    312       "justification": "Competitive comparison across Gemini, OpenAI, Perplexity, and open-source systems reveals clear performance gaps, but the framing is measured and academic rather than confrontational."
    313     },
    314     "demo_ability": {
    315       "score": 2,
    316       "justification": "Code and data released at github.com/OPPO-PersonalAI/FINDER_DEFT enables practitioners to run FINDER evaluation on their own DRA systems immediately."
    317     },
    318     "brand_recognition": {
    319       "score": 2,
    320       "justification": "Evaluates Gemini-2.5-Pro, OpenAI O3/O4-Mini Deep Research, and Perplexity; uses Claude Opus 4.1 as one of the five LLM coders in taxonomy construction; OPPO is a globally recognized consumer electronics brand."
    321     }
    322   },
    323   "hn_data": {
    324     "threads": [
    325       {
    326         "hn_id": "38869223",
    327         "title": "Show HN: RAGatouille, a simple lib to use&train top retrieval models in RAG apps",
    328         "points": 15,
    329         "comments": 5,
    330         "url": "https://news.ycombinator.com/item?id=38869223",
    331         "created_at": "2024-01-04T16:48:34Z"
    332       },
    333       {
    334         "hn_id": "46439655",
    335         "title": "Stable-Pretraining-v1: Foundation Model Research Made Simple",
    336         "points": 7,
    337         "comments": 0,
    338         "url": "https://news.ycombinator.com/item?id=46439655",
    339         "created_at": "2025-12-30T23:51:39Z"
    340       },
    341       {
    342         "hn_id": "43063462",
    343         "title": "The hierarchy in HNSW is not necessary in high dimensions",
    344         "points": 5,
    345         "comments": 1,
    346         "url": "https://news.ycombinator.com/item?id=43063462",
    347         "created_at": "2025-02-15T23:19:07Z"
    348       },
    349       {
    350         "hn_id": "33870608",
    351         "title": "Clustering – Basic Concepts and Methods",
    352         "points": 3,
    353         "comments": 0,
    354         "url": "https://news.ycombinator.com/item?id=33870608",
    355         "created_at": "2022-12-05T19:30:19Z"
    356       },
    357       {
    358         "hn_id": "45665498",
    359         "title": "A Fine-Grained Purpose-Based Access Control System for Large Data Warehouses",
    360         "points": 2,
    361         "comments": 0,
    362         "url": "https://news.ycombinator.com/item?id=45665498",
    363         "created_at": "2025-10-22T06:24:49Z"
    364       },
    365       {
    366         "hn_id": "42962673",
    367         "title": "Paradoxical Behavior in Collatz Sequences",
    368         "points": 2,
    369         "comments": 0,
    370         "url": "https://news.ycombinator.com/item?id=42962673",
    371         "created_at": "2025-02-06T14:33:11Z"
    372       },
    373       {
    374         "hn_id": "46504193",
    375         "title": "Evolution Without an Oracle: Driving Effective Evolution with LLM Judges",
    376         "points": 2,
    377         "comments": 0,
    378         "url": "https://news.ycombinator.com/item?id=46504193",
    379         "created_at": "2026-01-05T20:12:58Z"
    380       },
    381       {
    382         "hn_id": "42517812",
    383         "title": "Improving feature interactions at Pinterest under industry constraints",
    384         "points": 1,
    385         "comments": 0,
    386         "url": "https://news.ycombinator.com/item?id=42517812",
    387         "created_at": "2024-12-26T21:05:19Z"
    388       },
    389       {
    390         "hn_id": "29450974",
    391         "title": "ColBERTv2: Effective and Efficient Retrieval via Lightweight Late Interaction",
    392         "points": 1,
    393         "comments": 0,
    394         "url": "https://news.ycombinator.com/item?id=29450974",
    395         "created_at": "2021-12-05T17:36:32Z"
    396       }
    397     ],
    398     "top_points": 15,
    399     "total_points": 38,
    400     "total_comments": 6
    401   }
    402 }

Impressum · Datenschutz