scan-v4.json (20469B)
1 { 2 "scan_version": 4, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "DatasetResearch: Benchmarking Agent Systems for Demand-Driven Dataset Discovery", 6 "authors": [ 7 "Keyu Li", 8 "Mohan Jiang", 9 "Dayuan Fu", 10 "Yunze Wu", 11 "Xiangkun Hu" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2508.06960", 16 "doi": "10.48550/arXiv.2508.06960" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "Abstract claims of 22% max on pro subset, search/synthesis dichotomy, and corner case failures are all supported by Table 2, Figure 5, and Section 6 analysis.", 24 "source": "opus" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper claims synthesis agents' 'advantage stems from their ability to generate reasoning-rich, more instruction-aligned output data' — this is a causal claim from observational comparison without a controlled experiment isolating this factor.", 30 "source": "opus" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The abstract claims this benchmarks 'AI agents' ability to discover and synthesize datasets' generally, but results are limited to NLP text tasks with LLaMA-3.1-8B as the only evaluation model. Title claims 'demand-driven dataset discovery' broadly.", 36 "source": "opus" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "No discussion of alternative explanations for the search/synthesis performance gap. 
The gap could instead stem from prompt design, dataset formatting by o3, or LLaMA-3.1-8B's specific characteristics rather than fundamental agent properties.", 42 "source": "opus" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper measures fine-tuning performance on LLaMA-3.1-8B as a proxy for 'dataset quality' and 'dataset discovery capability' without discussing whether this proxy captures what matters. A dataset could be valuable for other models or purposes.", 48 "source": "opus" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 7.2 'Limitation and Future Work' discusses three specific limitations: web-scale curation, open-source model evaluation, and hybrid agent approaches.", 56 "source": "opus" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "Section 7.2 discusses future work directions rather than specific threats to the current study's validity. It makes no mention of LLM-as-judge bias, single evaluation model limitation, or prompt sensitivity.", 62 "source": "opus" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "Section 7.2 mentions the benchmark 'relies on datasets from structured repositories' but does not explicitly bound claims. 
The paper's sweeping language ('finding any dataset in the digital universe') is not bounded.", 68 "source": "opus" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding or acknowledgments section found in the paper.", 76 "source": "opus" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations listed: Shanghai Jiao Tong University, SII, GAIR.", 82 "source": "opus" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": false, 87 "justification": "No funding information disclosed, so independence cannot be assessed.", 88 "source": "opus" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial disclosure statement in the paper.", 94 "source": "opus" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Section 3.1 formally defines 'data discovery,' 'DataResearcher,' and 'MetaTriplet'; knowledge-based vs. reasoning-based categories are defined with explicit criteria in Section 3.2.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Four explicit bullet-point contributions are listed at the end of Section 1: the benchmark itself, the evaluation methodology, experimental findings, and failure mode analysis.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 2 explicitly positions DATASETRESEARCH against Viswanathan et al. 2023, Walker et al. 2023, and Gandhi et al. 
2024, explaining why prior approaches 'do not fully harness the deep reasoning and inferential capabilities enabled by modern LLMs.'", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "benchmark-creation": { 120 "construct_design": { 121 "construct_validity_argued": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper asserts the benchmark measures 'demand-driven dataset discovery capability' but never argues why downstream LLaMA-3.1-8B fine-tuning performance on 6 NLP tasks is a valid proxy for this capability — the connection between the proxy metric and the claimed construct is assumed rather than justified.", 125 "source": "haiku" 126 }, 127 "difficulty_distribution_characterized": { 128 "applies": true, 129 "answer": false, 130 "justification": "The only difficulty characterization is a binary knowledge/reasoning split and a post-hoc hard subset (DatasetResearch-pro) selected by GPT-4o-search worst-performing tasks; no formal difficulty tiers, IRT analysis, or difficulty measurement is provided.", 131 "source": "haiku" 132 }, 133 "ceiling_floor_effects_checked": { 134 "applies": true, 135 "answer": true, 136 "justification": "Step 2 of curation explicitly excludes tasks 'where baseline model performance had already reached near-saturation levels,' and the pro subset was created specifically because standard tasks showed insufficient discrimination.", 137 "source": "haiku" 138 }, 139 "human_baseline_included": { 140 "applies": true, 141 "answer": false, 142 "justification": "No human baseline is reported; while deep research agents required human-in-the-loop execution, the humans' performance as data researchers is not measured or reported.", 143 "source": "haiku" 144 }, 145 "scoring_rubric_justified": { 146 "applies": true, 147 "answer": false, 148 "justification": "The normalized score formula (Seval/Sref) is explained pragmatically for cross-task comparability, but the choice to use LLaMA-3.1-8B as the evaluation 
model is unjustified, and the metadata scoring by o3 is circular since o3 also generated the reference metadata.", 149 "source": "haiku" 150 } 151 }, 152 "robustness": { 153 "contamination_resistance_designed": { 154 "applies": true, 155 "answer": true, 156 "justification": "The benchmark explicitly selects 'gated' HuggingFace datasets requiring manual approval for access, directly preventing search agents from automatically downloading reference datasets even if identified.", 157 "source": "haiku" 158 }, 159 "temporal_robustness_discussed": { 160 "applies": true, 161 "answer": false, 162 "justification": "Future work mentions extending to unstructured web data but does not discuss whether or how the benchmark will be updated as model capabilities evolve or whether current tasks will become trivial.", 163 "source": "haiku" 164 }, 165 "failure_modes_discussed": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 6.3 explicitly analyzes 'corner cases' as a failure mode of all methods, explaining that both search and synthesis are fundamentally limited by training data distributions.", 169 "source": "haiku" 170 }, 171 "baseline_implementations_provided": { 172 "applies": true, 173 "answer": true, 174 "justification": "The GitHub repository (GAIR-NLP/DatasetResearch) is linked, search/synthesis/deep-research agent implementations are described in detail with prompts in Appendix C, and fine-tuning configs are provided in Appendix D.", 175 "source": "haiku" 176 } 177 }, 178 "documentation": { 179 "dataset_documentation_complete": { 180 "applies": true, 181 "answer": true, 182 "justification": "The 7-step curation pipeline for HuggingFace datasets is documented in detail, metadata schema (6 dimensions) is specified in Appendix A, and collection methodology from PapersWithCode is described analogously.", 183 "source": "haiku" 184 }, 185 "licensing_and_access_clear": { 186 "applies": true, 187 "answer": false, 188 "justification": "The GitHub URL is 
provided but no license for the benchmark itself is stated; the reference datasets remain gated/restricted, which limits replicability without separately obtaining HF access to each underlying dataset.", 189 "source": "haiku" 190 }, 191 "intended_use_specified": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 3.1 clearly defines the intended use (evaluating AI agents on demand-driven dataset discovery and synthesis), and limitations Section 7.2 bounds the scope to NLP text tasks from structured repositories.", 195 "source": "haiku" 196 } 197 } 198 } 199 }, 200 "claims": [ 201 { 202 "claim": "Even the most advanced deep research systems achieve only 22% score on the DatasetResearch-pro subset.", 203 "evidence": "Figure 5 and Section 5.2 report OpenAI DeepResearch achieving 0.2218 on DatasetResearch-pro across few-shot and fine-tuning settings.", 204 "supported": "strong" 205 }, 206 { 207 "claim": "Search agents excel at knowledge-based tasks while synthesis agents dominate reasoning-based tasks.", 208 "evidence": "Table 2 shows GPT-4o-search achieves 41.89% fine-tuning on knowledge vs. 27.54% on reasoning; OpenAI o3 w/ ref achieves 38.98% on knowledge vs. 
72.70% on reasoning.", 209 "supported": "strong" 210 }, 211 { 212 "claim": "All current agent methodologies fail on corner cases outside existing data distributions.", 213 "evidence": "Section 6.3 presents a case study where all methods (search, synthesis, deep research) score near 0% on a niche medical coding dataset, compared to a 91.7% reference ceiling.", 214 "supported": "moderate" 215 }, 216 { 217 "claim": "Few-shot evaluation (especially 3-shot) serves as a reliable proxy for fine-tuning performance.", 218 "evidence": "Section 5.2 observes 'consistent relative trends' between few-shot and fine-tuning results; 3-shot is claimed most stable, but no correlation coefficient is reported.", 219 "supported": "weak" 220 }, 221 { 222 "claim": "Providing a reference sample to the synthesis agent (o3 w/ ref) improves performance over w/o ref.", 223 "evidence": "Table 2 shows fine-tuning scores of 72.70% (w/ ref) vs. 67.25% (w/o ref) on reasoning tasks and 38.98% vs. 37.94% on knowledge tasks.", 224 "supported": "moderate" 225 }, 226 { 227 "claim": "Deep research methodology retrieves significantly higher quality data than single-shot search agents on DatasetResearch-pro.", 228 "evidence": "Figure 5 shows OpenAI DeepResearch outperforming GPT-4o-mini-search on DatasetResearch-pro; Figure 7 case study shows DeepResearch (0.19) vs. search (0.005) and synthesis (0.05) on a Vietnamese exam task.", 229 "supported": "moderate" 230 } 231 ], 232 "methodology_tags": [ 233 "benchmark-eval" 234 ], 235 "key_findings": "DATASETRESEARCH establishes the first benchmark for demand-driven dataset discovery, revealing that current AI agents fall far short of the goal: even the best deep research systems score only 22% on the hard DatasetResearch-pro subset. 
A fundamental performance dichotomy emerges: search agents excel at knowledge-intensive tasks via retrieval breadth while synthesis agents dominate reasoning tasks via structured generation, yet both fail catastrophically on corner cases outside existing data distributions. The benchmark demonstrates that automated data curation remains an unsolved problem, with fine-tuning on discovered datasets achieving at most 73% of the reference performance on reasoning tasks and 42% on knowledge tasks.", 236 "red_flags": [ 237 { 238 "flag": "Circular metadata evaluation", 239 "detail": "OpenAI o3 generates the reference metadata and also serves as the judge for metadata similarity scoring — the paper claims this 'systematically mitigates potential scoring biases' but the opposite is likely true: the judge will reward data matching o3's generation style." 240 }, 241 { 242 "flag": "Post-hoc hard subset selection", 243 "detail": "DatasetResearch-pro is constructed by selecting the 20 tasks where GPT-4o-search performed worst in preliminary experiments, then advanced systems are evaluated on this same subset — this introduces selection bias toward tasks that specifically challenge retrieval-based approaches." 244 }, 245 { 246 "flag": "Human-in-the-loop evaluation inconsistency", 247 "detail": "Deep research tools required manual human execution and curation of results, introducing uncontrolled human judgment into the evaluation process that is not present for search and synthesis agents." 248 }, 249 { 250 "flag": "Single small evaluation model", 251 "detail": "All downstream task performance is measured by fine-tuning LLaMA-3.1-8B specifically, with no validation that results generalize to other model families or scales." 
252 }, 253 { 254 "flag": "Overreaching benchmark scope claims", 255 "detail": "The abstract claims to 'illuminate the path toward AI systems capable of finding any dataset in the digital universe' based on 208 tasks across only 6 NLP categories from two structured repositories — this is a severe generalization overreach." 256 }, 257 { 258 "flag": "No human performance baseline", 259 "detail": "The benchmark lacks any measurement of how humans perform as DataResearchers, making it impossible to contextualize agent performance relative to human capability on the same tasks." 260 } 261 ], 262 "cited_papers": [ 263 { 264 "title": "DataFinder: Scientific Dataset Recommendation from Natural Language Descriptions", 265 "relevance": "Direct predecessor work on dataset discovery via natural language queries; DATASETRESEARCH explicitly builds beyond this approach" 266 }, 267 { 268 "title": "Prompting Datasets: Data Discovery with Conversational Agents", 269 "relevance": "Prior work on conversational AI for data discovery; identified LLM hallucination of non-existent datasets as a key challenge" 270 }, 271 { 272 "title": "Better Synthetic Data by Retrieving and Transforming Existing Datasets", 273 "relevance": "Prior work on dataset synthesis/transformation that DATASETRESEARCH extends with more rigorous agent evaluation" 274 }, 275 { 276 "title": "ScienceAgentBench: Toward Rigorous Assessment of Language Agents for Data-Driven Scientific Discovery", 277 "relevance": "Related benchmark for scientific agent evaluation; helps situate this work in the agent benchmarking landscape" 278 }, 279 { 280 "title": "DiscoveryBench: Towards Data-Driven Discovery with Large Language Models", 281 "relevance": "Related benchmark for LLM-assisted scientific discovery from data" 282 }, 283 { 284 "title": "DeepResearcher: Scaling Deep Research via Reinforcement Learning in Real-World Environments", 285 "relevance": "Deep research agent methodology directly evaluated in this benchmark" 286 }, 
287 { 288 "title": "SWE-Smith: Scaling Data for Software Engineering Agents", 289 "relevance": "Example of dataset construction for agent training; motivates demand-driven dataset discovery" 290 }, 291 { 292 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 293 "relevance": "Foundational work for LLM-as-judge evaluation methodology used in metadata scoring" 294 } 295 ], 296 "engagement_factors": { 297 "practical_relevance": { 298 "score": 1, 299 "justification": "Benchmarks dataset discovery agents but doesn't provide a usable tool — practitioners can't directly apply this to their workflows." 300 }, 301 "surprise_contrarian": { 302 "score": 1, 303 "justification": "The 22% ceiling for deep research systems is notable but 'AI struggles on hard benchmark' is a familiar narrative, not a contrarian finding." 304 }, 305 "fear_safety": { 306 "score": 0, 307 "justification": "No safety, security, or risk angle whatsoever." 308 }, 309 "drama_conflict": { 310 "score": 0, 311 "justification": "No controversy or conflict — straightforwardly evaluates systems without challenging specific company claims." 312 }, 313 "demo_ability": { 314 "score": 1, 315 "justification": "Code is public on GitHub but reproducing requires fine-tuning LLaMA-3.1-8B, multiple API keys, and significant compute." 316 }, 317 "brand_recognition": { 318 "score": 1, 319 "justification": "From Shanghai Jiao Tong University/GAIR — recognized in NLP but not a household name in broader tech circles." 
320 } 321 }, 322 "hn_data": { 323 "threads": [ 324 { 325 "hn_id": "43014573", 326 "title": "Time to act on the risk of efficient personalized text generation", 327 "points": 57, 328 "comments": 34, 329 "url": "https://news.ycombinator.com/item?id=43014573", 330 "created_at": "2025-02-11T16:14:03Z" 331 }, 332 { 333 "hn_id": "45234790", 334 "title": "Reverse-Engineered Reasoning for Open-Ended Generation", 335 "points": 4, 336 "comments": 1, 337 "url": "https://news.ycombinator.com/item?id=45234790", 338 "created_at": "2025-09-13T19:49:08Z" 339 }, 340 { 341 "hn_id": "45184326", 342 "title": "Reasoning Traces from QA Pairs", 343 "points": 3, 344 "comments": 1, 345 "url": "https://news.ycombinator.com/item?id=45184326", 346 "created_at": "2025-09-09T16:25:03Z" 347 }, 348 { 349 "hn_id": "44516439", 350 "title": "Amazon gets serious with AI Safety", 351 "points": 3, 352 "comments": 0, 353 "url": "https://news.ycombinator.com/item?id=44516439", 354 "created_at": "2025-07-10T01:50:50Z" 355 }, 356 { 357 "hn_id": "45226714", 358 "title": "Are ArXiv submissions on Wednesday better cited?", 359 "points": 2, 360 "comments": 0, 361 "url": "https://news.ycombinator.com/item?id=45226714", 362 "created_at": "2025-09-12T21:00:07Z" 363 }, 364 { 365 "hn_id": "44889206", 366 "title": "Large Language Models Do Not Simulate Human Psychology", 367 "points": 1, 368 "comments": 0, 369 "url": "https://news.ycombinator.com/item?id=44889206", 370 "created_at": "2025-08-13T14:50:01Z" 371 }, 372 { 373 "hn_id": "32619543", 374 "title": "Angle-agnostic cloaking from person-tracking systems with a t-shirt", 375 "points": 1, 376 "comments": 1, 377 "url": "https://news.ycombinator.com/item?id=32619543", 378 "created_at": "2022-08-27T14:42:49Z" 379 }, 380 { 381 "hn_id": "44521323", 382 "title": "Evaluating the Critical Risks of Amazon’s Nova Premier", 383 "points": 1, 384 "comments": 0, 385 "url": "https://news.ycombinator.com/item?id=44521323", 386 "created_at": "2025-07-10T14:11:52Z" 387 }, 388 { 
389 "hn_id": "42705257", 390 "title": "What Hawking Radiation Looks Like as You Fall into a Black Hole", 391 "points": 1, 392 "comments": 0, 393 "url": "https://news.ycombinator.com/item?id=42705257", 394 "created_at": "2025-01-14T23:16:02Z" 395 } 396 ], 397 "top_points": 57, 398 "total_points": 73, 399 "total_comments": 37 400 } 401 }