scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (21856B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "DatasetResearch: Benchmarking Agent Systems for Demand-Driven Dataset Discovery",
      6     "authors": [
      7       "Keyu Li",
      8       "Mohan Jiang",
      9       "Dayuan Fu",
     10       "Yunze Wu",
     11       "Xiangkun Hu"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2508.06960",
     16     "doi": "10.48550/arXiv.2508.06960"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Key abstract claims are supported: 22% score on DatasetResearch-pro is confirmed in Figure 5 (OpenAI DeepResearch scores 0.2218), the search/synthesis dichotomy is demonstrated in Table 2 (search best at knowledge 41.89%, synthesis best at reasoning 72.70%), and corner case failures are illustrated in Figure 8.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper attributes synthesis agents' reasoning advantage to 'generating richly detailed data with explicit thought processes,' but this is an interpretive post-hoc explanation based on correlation in benchmark scores, not an ablation controlling for factors like model size, training data, or retrieval index differences.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The abstract claims to 'illuminate the path toward AI systems capable of finding any dataset in the digital universe,' but the benchmark covers only 208 NLP tasks from two structured platforms (HuggingFace, PapersWithCode), six task categories, and text-only modality — far narrower than the generalization implies.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper attributes search agents' knowledge advantage entirely to 'retrieval breadth' and synthesis agents' reasoning advantage to 'structured generation,' without considering alternative explanations such as training data distribution differences, model capability differences unrelated to architecture, or evaluation metric sensitivity.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly distinguishes metadata similarity scores (what is measured) from downstream fine-tuning performance (practical utility), and explains the normalized scoring formula (Seval/Sref) with discussion of why normalization is needed across heterogeneous task metrics.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7.2 'Limitation and Future Work' is a substantive section discussing three specific limitations: structured-repository scope, reliance on closed-source models, and the need for hybrid agents.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The limitations discuss scope (HuggingFace/PapersWithCode only) and model type (closed-source) but omit major threats: the circular use of o3 to generate reference metadata and then score metadata alignment, selection bias in the pro subset (selected by GPT-4o-search failure), and using only LLaMA-3.1-8B as the fine-tuning evaluation model.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states scope is limited to text-only modality, six NLP task categories, and datasets from HuggingFace and PapersWithCode — and the limitations section calls these out as explicit boundaries.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No acknowledgment or funding disclosure section appears anywhere in the paper text.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are disclosed on the first page: Shanghai Jiao Tong University, SII, and GAIR.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so funder independence cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or financial interest declaration appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 3.1 formally defines 'data discovery,' MetaTriplet components (demand description, reference set, reference metadata), and Section 3.2 defines the knowledge-based vs. reasoning-based distinction with operational criteria.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The introduction lists four explicit bullet-point contributions: the benchmark itself, the evaluation methodology, experimental results on state-of-the-art systems, and systematic failure mode analysis.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 engages with Viswanathan et al. [2023], Walker et al. [2023], and Gandhi et al. [2024], explaining how this benchmark extends beyond prior dataset recommendation work by evaluating full agent systems and including downstream fine-tuning performance.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "The paper asserts the MetaTriplet framework measures dataset discovery quality but does not rigorously argue why metadata alignment + fine-tuning performance captures this construct; critically, o3 generates reference metadata and also judges alignment scores, creating a circularity the paper dismisses as a feature rather than a validity threat.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Difficulty is operationalized only as a binary (regular vs. pro), where the pro subset is selected based on GPT-4o-search failure rates — not via independent difficulty modeling; no easy/medium/hard tiers or difficulty measurement independent of model performance is provided.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "The results show no ceiling effects (top agent scores 22% on pro, 73% on main benchmark reasoning tasks), and the use of gated datasets was explicitly designed to prevent trivial search solutions; corner cases near floor (0%) are documented in Section 6.3.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No human baseline is included; the paper uses zero-shot LLaMA-3.1-8B as a performance floor but never measures how a human researcher would perform at dataset discovery under the same demand descriptions.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "The normalized score formula (Seval/Sref) is explained and the rationale for multiple metrics across task types is given, but the choice of LLaMA-3.1-8B as the sole fine-tuning model is not justified, and the o3-as-judge metadata scoring is circular since o3 also generated the reference metadata.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "The benchmark deliberately uses HuggingFace 'gated' datasets requiring manual approval, which prevents search agents from directly downloading and using the reference datasets — an explicit contamination resistance design described in Section 3.2 Step 1.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "There is no discussion of whether gated datasets may become ungated, whether HuggingFace availability will persist, or whether the benchmark will require updates as agent capabilities evolve; Section 7.2 covers future work but not benchmark longevity.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Section 6.3 explicitly discusses benchmark failure modes ('corner cases' outside existing data distributions where all methods fail), and Section 7.2 acknowledges the benchmark does not cover unstructured web sources or non-text modalities.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "While code is available on GitHub and Appendix D provides fine-tuning configs, the deep research agent experiments 'necessitate a human-in-the-loop approach' because the tools lack API access — making those baseline results not independently reproducible.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The 7-step curation pipeline is described in detail (Section 3.2), metadata components are explicitly defined (Appendix A), filtering criteria are specified, and the benchmark is available on GitHub with full prompts in the Appendix.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "The paper provides a GitHub URL but specifies no license for the benchmark; the underlying reference datasets from HuggingFace are gated (intentionally access-restricted), creating ambiguity about whether others can actually use or redistribute the benchmark.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": false,
    194         "justification": "The paper specifies intended use (evaluating demand-driven dataset discovery agents) but does not specify what should NOT be concluded from results — for instance, it gives no caution that scores on 6 NLP task categories should not be generalized to multimodal or scientific discovery tasks.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "Even the most advanced deep research systems achieve only 22% score on DatasetResearch-pro",
    203       "evidence": "Figure 5 and Section 5.2 report OpenAI DeepResearch achieves 0.2218 on the pro subset fine-tuning evaluation",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "Search agents excel at knowledge-intensive tasks while synthesis agents dominate reasoning tasks",
    208       "evidence": "Table 2 shows GPT-4o-search fine-tuning score of 41.89% on knowledge vs. OpenAI o3 w/ref score of 72.70% on reasoning tasks",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "All current agent methods catastrophically fail on corner cases outside existing data distributions",
    213       "evidence": "Figure 8 (Section 6.3) shows near-zero fine-tuning performance for all agents on one corner case; synthesis agent scores 0.0, search agent 0.067",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "Few-shot evaluation results maintain consistent relative trends with fine-tuning, making 3-shot a practical proxy",
    218       "evidence": "Table 2 shows 1/3/5-shot trends largely mirror fine-tuning rankings; Section 5.2 argues 3-shot is the most stable setting",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "Synthesis agents outperform search agents specifically due to their ability to generate structured, reasoning-aligned output data",
    223       "evidence": "Figure 6 case study shows o3 synthesis generates structured reasoning examples; metadata evaluation shows synthesis agents score 8.69 avg vs. search agents 5.71 avg",
    224       "supported": "weak"
    225     },
    226     {
    227       "claim": "DatasetResearch is the first comprehensive benchmark for demand-driven dataset discovery",
    228       "evidence": "Prior work (Viswanathan et al. 2023, Walker et al. 2023) is positioned as partial exploration not evaluating full agent systems with downstream fine-tuning",
    229       "supported": "moderate"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "benchmark-eval"
    234   ],
    235   "key_findings": "DatasetResearch establishes a 208-task benchmark for demand-driven dataset discovery across six NLP task categories, revealing that even state-of-the-art deep research systems achieve only 22% normalized score on the challenging pro subset. A clear specialization emerges: search agents achieve best knowledge-task performance (41.89% fine-tuning) while synthesis agents dominate reasoning tasks (72.70%). All evaluated methods fail catastrophically on corner cases outside existing data distributions, and the benchmark reveals that fine-tuning with synthetic data outperforms retrieval-based discovery for reasoning tasks by a large margin.",
    236   "red_flags": [
    237     {
    238       "flag": "Circular evaluation via o3",
    239       "detail": "OpenAI o3 is used to generate reference metadata, generate demand descriptions, AND serve as the judge for metadata similarity scoring — the paper acknowledges this but frames it as bias mitigation rather than a validity threat."
    240     },
    241     {
    242       "flag": "Deep research results not reproducible",
    243       "detail": "Deep research agent experiments (OpenAI DeepResearch, Grok DeepResearch, Gemini DeepResearch) require manual human-in-the-loop curation because 'these deep research tools are not currently accessible via API calls' — these results cannot be independently reproduced."
    244     },
    245     {
    246       "flag": "Pro subset selection bias",
    247       "detail": "DatasetResearch-pro was selected by identifying the 20 tasks where GPT-4o-search-preview scored lowest, creating a biased hard subset that is specifically challenging for that agent and may not represent general difficulty."
    248     },
    249     {
    250       "flag": "Single fine-tuning model",
    251       "detail": "All downstream task performance is evaluated by fine-tuning only LLaMA-3.1-8B with a fixed configuration; it is unclear whether results generalize to other model sizes or architectures."
    252     },
    253     {
    254       "flag": "No human baseline",
    255       "detail": "No human researcher benchmark for dataset discovery is provided, making it impossible to assess whether the 22% top-agent score represents near-human, far-below-human, or superhuman performance in practical terms."
    256     },
    257     {
    258       "flag": "Synthesis advantage may reflect o3 self-evaluation",
    259       "detail": "Synthesis agents primarily use o3, which also generates the reference metadata used in scoring; this may systematically inflate synthesis agents' metadata alignment scores compared to search agents using different models."
    260     }
    261   ],
    262   "cited_papers": [
    263     {
    264       "title": "DataFinder: Scientific Dataset Recommendation from Natural Language Descriptions",
    265       "relevance": "Direct predecessor: bi-encoder retriever recommending datasets from natural language research descriptions; DatasetResearch extends this to full agent evaluation with downstream fine-tuning"
    266     },
    267     {
    268       "title": "Prompting Datasets: Data Discovery with Conversational Agents",
    269       "relevance": "Prior work on using conversational LLMs for data discovery, including hallucination risks — motivates need for rigorous benchmark"
    270     },
    271     {
    272       "title": "Better Synthetic Data by Retrieving and Transforming Existing Datasets",
    273       "relevance": "Dataset transformation/synthesis prior work that DatasetResearch evaluates as a baseline approach"
    274     },
    275     {
    276       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    277       "relevance": "Cited as example of benchmark for evaluating code agents; provides methodological context for agent benchmarking"
    278     },
    279     {
    280       "title": "DeepResearcher: Scaling Deep Research via Reinforcement Learning in Real-World Environments",
    281       "relevance": "Related deep research agent system; its author list overlaps with this paper's (researchers from GAIR lab)"
    282     },
    283     {
    284       "title": "BrowseComp: A Simple Yet Challenging Benchmark for Browsing Agents",
    285       "relevance": "Related benchmark for agent web-browsing capabilities; comparator in the agent evaluation space"
    286     },
    287     {
    288       "title": "SWE-Smith: Scaling Data for Software Engineering Agents",
    289       "relevance": "Related work on data synthesis for agent training; demonstrates the general importance of data curation for agent performance"
    290     },
    291     {
    292       "title": "DiscoveryBench: Towards Data-Driven Discovery with Large Language Models",
    293       "relevance": "Related benchmark for data-driven scientific discovery agents; related evaluation space"
    294     }
    295   ],
    296   "engagement_factors": {
    297     "practical_relevance": {
    298       "score": 3,
    299       "justification": "Directly addresses a real bottleneck in AI development workflows — finding training data — with a public benchmark practitioners can use to evaluate discovery tools."
    300     },
    301     "surprise_contrarian": {
    302       "score": 1,
    303       "justification": "The finding that state-of-the-art deep research agents achieve only 22% is notable, but the search-vs-synthesis dichotomy and corner case failures are unsurprising given known limitations of each approach."
    304     },
    305     "fear_safety": {
    306       "score": 0,
    307       "justification": "No safety or risk concerns raised; the paper is about dataset curation efficiency, not harmful capabilities."
    308     },
    309     "drama_conflict": {
    310       "score": 0,
    311       "justification": "No controversy or conflict angle; the paper evaluates commercial systems matter-of-factly without provocative framing."
    312     },
    313     "demo_ability": {
    314       "score": 2,
    315       "justification": "The benchmark is publicly available on GitHub, and search/synthesis agents can be run against it, though deep research agents require manual curation."
    316     },
    317     "brand_recognition": {
    318       "score": 2,
    319       "justification": "GAIR lab (Pengfei Liu) is a recognized NLP research group; the paper also evaluates prominent commercial systems from OpenAI, Google, and xAI."
    320     }
    321   },
    322   "hn_data": {
    323     "threads": [
    324       {
    325         "hn_id": "43014573",
    326         "title": "Time to act on the risk of efficient personalized text generation",
    327         "points": 57,
    328         "comments": 34,
    329         "url": "https://news.ycombinator.com/item?id=43014573",
    330         "created_at": "2025-02-11T16:14:03Z"
    331       },
    332       {
    333         "hn_id": "45234790",
    334         "title": "Reverse-Engineered Reasoning for Open-Ended Generation",
    335         "points": 4,
    336         "comments": 1,
    337         "url": "https://news.ycombinator.com/item?id=45234790",
    338         "created_at": "2025-09-13T19:49:08Z"
    339       },
    340       {
    341         "hn_id": "45184326",
    342         "title": "Reasoning Traces from QA Pairs",
    343         "points": 3,
    344         "comments": 1,
    345         "url": "https://news.ycombinator.com/item?id=45184326",
    346         "created_at": "2025-09-09T16:25:03Z"
    347       },
    348       {
    349         "hn_id": "44516439",
    350         "title": "Amazon gets serious with AI Safety",
    351         "points": 3,
    352         "comments": 0,
    353         "url": "https://news.ycombinator.com/item?id=44516439",
    354         "created_at": "2025-07-10T01:50:50Z"
    355       },
    356       {
    357         "hn_id": "45226714",
    358         "title": "Are ArXiv submissions on Wednesday better cited?",
    359         "points": 2,
    360         "comments": 0,
    361         "url": "https://news.ycombinator.com/item?id=45226714",
    362         "created_at": "2025-09-12T21:00:07Z"
    363       },
    364       {
    365         "hn_id": "44889206",
    366         "title": "Large Language Models Do Not Simulate Human Psychology",
    367         "points": 1,
    368         "comments": 0,
    369         "url": "https://news.ycombinator.com/item?id=44889206",
    370         "created_at": "2025-08-13T14:50:01Z"
    371       },
    372       {
    373         "hn_id": "32619543",
    374         "title": "Angle-agnostic cloaking from person-tracking systems with a t-shirt",
    375         "points": 1,
    376         "comments": 1,
    377         "url": "https://news.ycombinator.com/item?id=32619543",
    378         "created_at": "2022-08-27T14:42:49Z"
    379       },
    380       {
    381         "hn_id": "44521323",
    382         "title": "Evaluating the Critical Risks of Amazon’s Nova Premier",
    383         "points": 1,
    384         "comments": 0,
    385         "url": "https://news.ycombinator.com/item?id=44521323",
    386         "created_at": "2025-07-10T14:11:52Z"
    387       },
    388       {
    389         "hn_id": "42705257",
    390         "title": "What Hawking Radiation Looks Like as You Fall into a Black Hole",
    391         "points": 1,
    392         "comments": 0,
    393         "url": "https://news.ycombinator.com/item?id=42705257",
    394         "created_at": "2025-01-14T23:16:02Z"
    395       }
    396     ],
    397     "top_points": 57,
    398     "total_points": 73,
    399     "total_comments": 37
    400   }
    401 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs