scan.json (25489B)
1 { 2 "paper": { 3 "title": "DatasetResearch: Benchmarking Agent Systems for Demand-Driven Dataset Discovery", 4 "authors": [ 5 "Keyu Li", 6 "Mohan Jiang", 7 "Dayuan Fu", 8 "Yunze Wu", 9 "Xiangkun Hu", 10 "Dequan Wang", 11 "Pengfei Liu" 12 ], 13 "year": 2025, 14 "venue": "arXiv", 15 "arxiv_id": "2508.06960", 16 "doi": "10.48550/arXiv.2508.06960" 17 }, 18 "scan_version": 3, 19 "active_modules": [ 20 "experimental_rigor", 21 "data_leakage" 22 ], 23 "methodology_tags": [ 24 "benchmark-eval" 25 ], 26 "key_findings": "DATASETRESEARCH benchmark with 208 real-world dataset demands reveals a stark performance gap: even advanced deep research systems achieve only 22% on the challenging pro subset. Search agents excel at knowledge-based tasks (42% fine-tuning score) while synthesis agents dominate reasoning tasks (73%). All current methods catastrophically fail on corner cases outside existing data distributions.", 27 "checklist": { 28 "artifacts": { 29 "code_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "GitHub repository provided: https://github.com/GAIR-NLP/DatasetResearch, referenced in the abstract." 33 }, 34 "data_released": { 35 "applies": true, 36 "answer": true, 37 "justification": "The benchmark data is stated as 'publicly available' at the GitHub link. The 208 demands and metadata are released." 38 }, 39 "environment_specified": { 40 "applies": true, 41 "answer": false, 42 "justification": "No requirements.txt, Dockerfile, or environment setup section is provided in the paper. Only a LlamaFactory config (Appendix D) is shown." 43 }, 44 "reproduction_instructions": { 45 "applies": true, 46 "answer": false, 47 "justification": "No step-by-step reproduction instructions are provided in the paper. The LlamaFactory config in Appendix D is partial but doesn't constitute full reproduction instructions." 
48 } 49 }, 50 "statistical_methodology": { 51 "confidence_intervals_or_error_bars": { 52 "applies": true, 53 "answer": false, 54 "justification": "Table 2 and Figure 5 report point estimates only. No confidence intervals, error bars, or ± notation anywhere in the results." 55 }, 56 "significance_tests": { 57 "applies": true, 58 "answer": false, 59 "justification": "The paper claims search agents 'excel' at knowledge tasks and synthesis agents are 'superior' at reasoning tasks, but no statistical tests support these comparisons." 60 }, 61 "effect_sizes_reported": { 62 "applies": true, 63 "answer": true, 64 "justification": "Results are reported as normalized scores with baselines (e.g., GPT-4o-search 42% on knowledge-task fine-tuning vs o3 w/ref 73% on reasoning-task fine-tuning), providing enough context to assess magnitude." 65 }, 66 "sample_size_justified": { 67 "applies": true, 68 "answer": false, 69 "justification": "No justification for why 208 tasks, 20 for the pro subset, or 500 synthesis samples. The pro subset selection is based on GPT-4o-search difficulty but the size is arbitrary." 70 }, 71 "variance_reported": { 72 "applies": true, 73 "answer": false, 74 "justification": "No variance, standard deviation, or spread measures reported for any results. Results appear to be single-run numbers throughout." 75 } 76 }, 77 "evaluation_design": { 78 "baselines_included": { 79 "applies": true, 80 "answer": true, 81 "justification": "Multiple baselines: GPT-4o-search, GPT-4o-mini-search, OpenAI o3 (w/ and w/o ref), and deep research agents (OpenAI, Grok, Gemini). Zero-shot LLaMA-3.1-8B as floor." 82 }, 83 "baselines_contemporary": { 84 "applies": true, 85 "answer": true, 86 "justification": "Baselines include GPT-4o-search-preview, OpenAI o3, OpenAI Deep Research, Gemini Deep Research, and Grok Deep Research — all state-of-the-art at time of writing." 87 }, 88 "ablation_study": { 89 "applies": true, 90 "answer": false, 91 "justification": "No ablation study of the benchmark design or evaluation framework components. 
The w/ref vs w/o ref comparison is a partial ablation of one variable but not a systematic ablation." 92 }, 93 "multiple_metrics": { 94 "applies": true, 95 "answer": true, 96 "justification": "Six metrics used across task types: Accuracy, F1-Score, Exact Match, BLEU, SacreBLEU, ROUGE (Table 1). Also metadata evaluation scores." 97 }, 98 "human_evaluation": { 99 "applies": true, 100 "answer": false, 101 "justification": "Human verification is used in benchmark curation (Step 6) but no human evaluation of the agent systems' outputs. Metadata evaluation uses o3 as judge." 102 }, 103 "held_out_test_set": { 104 "applies": true, 105 "answer": true, 106 "justification": "Reference datasets serve as held-out test sets. Fine-tuned models are evaluated on reference sets they were not trained on. Gated datasets prevent search agents from accessing originals." 107 }, 108 "per_category_breakdown": { 109 "applies": true, 110 "answer": true, 111 "justification": "Results broken down by knowledge vs reasoning tasks (Table 2), per-metric metadata scores across 6 dimensions, and per-method comparisons in Figure 5." 112 }, 113 "failure_cases_discussed": { 114 "applies": true, 115 "answer": true, 116 "justification": "Section 6.3 discusses corner cases where all methods fail (Figure 8, medical coding example). Sections 6.1-6.2 provide qualitative case studies." 117 }, 118 "negative_results_reported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper reports that all methods perform poorly overall (max 22% on pro subset), and explicitly discusses failure modes across all agent types." 122 } 123 }, 124 "claims_and_evidence": { 125 "abstract_claims_supported": { 126 "applies": true, 127 "answer": true, 128 "justification": "Abstract claims of 22% max on pro subset, search/synthesis dichotomy, and corner case failures are all supported by Table 2, Figure 5, and Section 6 analysis." 
129 }, 130 "causal_claims_justified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper claims synthesis agents' 'advantage stems from their ability to generate reasoning-rich, more instruction-aligned output data' — this is a causal claim from observational comparison without controlled experiment isolating this factor." 134 }, 135 "generalization_bounded": { 136 "applies": true, 137 "answer": false, 138 "justification": "The abstract claims this benchmarks 'AI agents' ability to discover and synthesize datasets' generally, but results are limited to NLP text tasks with LLaMA-3.1-8B as the only evaluation model. Title claims 'demand-driven dataset discovery' broadly." 139 }, 140 "alternative_explanations_discussed": { 141 "applies": true, 142 "answer": false, 143 "justification": "No discussion of alternative explanations for the search/synthesis performance gap. Could be prompt design, dataset formatting by o3, or LLaMA-3.1-8B's specific characteristics rather than fundamental agent properties." 144 }, 145 "proxy_outcome_distinction": { 146 "applies": true, 147 "answer": false, 148 "justification": "The paper measures fine-tuning performance on LLaMA-3.1-8B as a proxy for 'dataset quality' and 'dataset discovery capability' without discussing whether this proxy captures what matters. A dataset could be valuable for other models or purposes." 149 } 150 }, 151 "setup_transparency": { 152 "model_versions_specified": { 153 "applies": true, 154 "answer": false, 155 "justification": "Models referred to by marketing names: 'GPT-4o-search-preview', 'OpenAI o3', 'Grok Deep Research'. No API versions, snapshot dates, or exact model identifiers provided." 156 }, 157 "prompts_provided": { 158 "applies": true, 159 "answer": true, 160 "justification": "Full prompts provided in Appendix C: demand description generation (C.1), search agent (C.2), synthesis agent (C.2), fine-tuning data extraction (C.2), metadata generation and evaluation (C.3)." 
161 }, 162 "hyperparameters_reported": { 163 "applies": true, 164 "answer": true, 165 "justification": "Appendix D provides full LlamaFactory fine-tuning config including learning rate (1e-5), batch size, epochs (3), cutoff length (4096), scheduler, etc. Synthesis uses 10 samples × 50 iterations." 166 }, 167 "scaffolding_described": { 168 "applies": false, 169 "answer": false, 170 "justification": "The paper evaluates third-party deep research agents (OpenAI, Grok, Gemini) as black boxes. The search and synthesis agents are simple prompt-based workflows, not agentic scaffolds." 171 }, 172 "data_preprocessing_documented": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 3.2 documents 7 filtering steps with counts: 1M+ → 422 (task/modality filter) → 261 (documentation check) → 104 (fine-tuning suitability) → 91 HuggingFace + 117 PapersWithCode = 208." 176 } 177 }, 178 "limitations_and_scope": { 179 "limitations_section_present": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 7.2 'Limitation and Future Work' discusses three specific limitations: web-scale curation, open-source model evaluation, and hybrid agent approaches." 183 }, 184 "threats_to_validity_specific": { 185 "applies": true, 186 "answer": false, 187 "justification": "Section 7.2 discusses future work directions rather than specific threats to the current study's validity. No mention of LLM-as-judge bias, single evaluation model limitation, or prompt sensitivity." 188 }, 189 "scope_boundaries_stated": { 190 "applies": true, 191 "answer": false, 192 "justification": "Section 7.2 mentions the benchmark 'relies on datasets from structured repositories' but does not explicitly bound claims. The paper's sweeping language ('finding any dataset in the digital universe') is not bounded." 
193 } 194 }, 195 "data_integrity": { 196 "raw_data_available": { 197 "applies": true, 198 "answer": true, 199 "justification": "Benchmark and data are stated as publicly available at the GitHub repo. Reference datasets, demands, and metadata are released." 200 }, 201 "data_collection_described": { 202 "applies": true, 203 "answer": true, 204 "justification": "Section 3.2 details 7-step collection pipeline from HuggingFace API filtering through human verification to metadata generation." 205 }, 206 "recruitment_methods_described": { 207 "applies": false, 208 "answer": false, 209 "justification": "No human participants. Data sourced from HuggingFace and PapersWithCode public repositories." 210 }, 211 "data_pipeline_documented": { 212 "applies": true, 213 "answer": true, 214 "justification": "Full pipeline documented in Section 3.2 and Figure 2, with counts at each stage and filtering criteria described." 215 } 216 }, 217 "conflicts_of_interest": { 218 "funding_disclosed": { 219 "applies": true, 220 "answer": false, 221 "justification": "No funding or acknowledgments section found in the paper." 222 }, 223 "affiliations_disclosed": { 224 "applies": true, 225 "answer": true, 226 "justification": "Author affiliations listed: Shanghai Jiao Tong University, SII, GAIR." 227 }, 228 "funder_independent_of_outcome": { 229 "applies": true, 230 "answer": false, 231 "justification": "No funding information disclosed, so independence cannot be assessed." 232 }, 233 "financial_interests_declared": { 234 "applies": true, 235 "answer": false, 236 "justification": "No competing interests or financial disclosure statement in the paper." 237 } 238 }, 239 "contamination": { 240 "training_cutoff_stated": { 241 "applies": true, 242 "answer": false, 243 "justification": "No training cutoff dates stated for any of the evaluated models (GPT-4o, o3, LLaMA-3.1-8B, etc.)." 
244 }, 245 "train_test_overlap_discussed": { 246 "applies": true, 247 "answer": true, 248 "justification": "Section 3.2 Step 1 explicitly uses 'gated' datasets to mitigate data leakage, as 'search agents cannot automatically download and process these datasets even if they are identified.' This is a partial but deliberate mitigation." 249 }, 250 "benchmark_contamination_addressed": { 251 "applies": true, 252 "answer": false, 253 "justification": "While gated datasets prevent download, the metadata/README files of these datasets may be in model training data. The paper doesn't discuss whether models have seen the dataset descriptions, names, or samples through their training data." 254 } 255 }, 256 "human_studies": { 257 "pre_registered": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "irb_or_ethics_approval": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants." 266 }, 267 "demographics_reported": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants." 271 }, 272 "inclusion_exclusion_criteria": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants." 276 }, 277 "randomization_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants." 281 }, 282 "blinding_described": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants." 286 }, 287 "attrition_reported": { 288 "applies": false, 289 "answer": false, 290 "justification": "No human participants." 291 } 292 }, 293 "cost_and_practicality": { 294 "inference_cost_reported": { 295 "applies": true, 296 "answer": false, 297 "justification": "No API costs, token counts, or per-task costs reported despite heavy use of o3 for metadata generation, evaluation, and synthesis (50 iterations × 208 tasks)." 
298 }, 299 "compute_budget_stated": { 300 "applies": true, 301 "answer": false, 302 "justification": "No total compute budget stated. Full-parameter fine-tuning of LLaMA-3.1-8B is performed but GPU hours and hardware are not specified." 303 } 304 }, 305 "experimental_rigor": { 306 "seed_sensitivity_reported": { 307 "applies": true, 308 "answer": false, 309 "justification": "No mention of multiple random seeds for fine-tuning experiments. Results appear to be single-run." 310 }, 311 "number_of_runs_stated": { 312 "applies": true, 313 "answer": false, 314 "justification": "Number of experimental runs not stated anywhere. Cannot determine if results are single-run or averaged." 315 }, 316 "hyperparameter_search_budget": { 317 "applies": true, 318 "answer": false, 319 "justification": "Appendix D shows 'fixed hyperparameters' for fine-tuning, but no discussion of whether these were tuned or how they were selected." 320 }, 321 "best_config_selection_justified": { 322 "applies": true, 323 "answer": false, 324 "justification": "The paper states 'fixed hyperparameters' but doesn't justify why these specific values were chosen or whether any tuning was performed." 325 }, 326 "multiple_comparison_correction": { 327 "applies": true, 328 "answer": false, 329 "justification": "No statistical tests performed at all, so no multiple comparison correction either." 330 }, 331 "self_comparison_bias_addressed": { 332 "applies": true, 333 "answer": false, 334 "justification": "The authors designed the benchmark and evaluation framework, then evaluated agents on it. No discussion of potential bias in benchmark design favoring certain agent types." 335 }, 336 "compute_budget_vs_performance": { 337 "applies": true, 338 "answer": false, 339 "justification": "Deep research agents likely use far more compute than simple search queries, but no compute-normalized comparison is provided." 
340 }, 341 "benchmark_construct_validity": { 342 "applies": true, 343 "answer": false, 344 "justification": "No discussion of whether fine-tuning LLaMA-3.1-8B performance actually measures 'dataset discovery quality.' The proxy relationship between fine-tuning success and dataset utility is not examined." 345 }, 346 "scaffold_confound_addressed": { 347 "applies": false, 348 "answer": false, 349 "justification": "The paper evaluates different agent paradigms (search vs synthesis vs deep research) as distinct approaches rather than comparing models within a shared scaffold." 350 } 351 }, 352 "data_leakage": { 353 "temporal_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of whether the evaluated models were trained on data that includes the reference datasets' documentation, samples, or metadata." 357 }, 358 "feature_leakage_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "The demand descriptions are generated from reference dataset metadata by o3. No discussion of whether these descriptions leak specific information that would make discovery trivial." 362 }, 363 "non_independence_addressed": { 364 "applies": true, 365 "answer": false, 366 "justification": "No discussion of potential overlap between the datasets used for fine-tuning and the reference test sets, or between o3's training data and the benchmark datasets." 367 }, 368 "leakage_detection_method": { 369 "applies": true, 370 "answer": true, 371 "justification": "Gated dataset selection (Step 1) is a concrete leakage prevention method — restricting to datasets that require manual approval prevents automatic download by search agents." 
372 } 373 } 374 }, 375 "claims": [ 376 { 377 "claim": "Even advanced deep research systems achieve only 22% score on DatasetResearch-pro subset.", 378 "evidence": "Figure 5 shows OpenAI DeepResearch achieving 0.2218 fine-tuning score on the pro subset (Section 5.2).", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Search agents excel at knowledge-based tasks while synthesis agents dominate reasoning-based tasks.", 383 "evidence": "Table 2: GPT-4o-search achieves 42% fine-tuning on knowledge tasks; OpenAI o3 w/ref achieves 73% on reasoning tasks.", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "Few-shot evaluation results are closely aligned with fine-tuning experiments, making few-shot a viable preliminary assessment.", 388 "evidence": "Table 2 shows relative trends between few-shot and fine-tuning are consistent across methods (Section 5.2).", 389 "supported": "weak" 390 }, 391 { 392 "claim": "All current methods fail on corner cases outside existing data distributions.", 393 "evidence": "Figure 8 shows degraded performance on medical coding task. Section 6.3 discusses the limitation qualitatively.", 394 "supported": "weak" 395 } 396 ], 397 "red_flags": [ 398 { 399 "flag": "LLM-as-judge circularity", 400 "detail": "OpenAI o3 is used to generate reference metadata, generate demand descriptions, synthesize datasets, format search results, AND judge metadata similarity scores. This creates circular dependencies where the judge favors its own outputs." 401 }, 402 { 403 "flag": "Single evaluation model", 404 "detail": "All fine-tuning evaluation uses only LLaMA-3.1-8B. Results may not generalize to other model sizes or architectures. A dataset good for LLaMA-8B may not be good for other models." 405 }, 406 { 407 "flag": "No variance or repeated runs", 408 "detail": "Fine-tuning results are reported without any variance measures, error bars, or repeated runs. Full-parameter fine-tuning of 8B models is known to be sensitive to random seeds." 
409 }, 410 { 411 "flag": "Pro subset selection bias", 412 "detail": "DatasetResearch-pro is defined as the 20 hardest tasks for GPT-4o-search-preview. This systematically biases the pro subset toward tasks where search fails, potentially inflating the relative advantage of synthesis methods." 413 }, 414 { 415 "flag": "Overclaiming in abstract", 416 "detail": "Abstract claims about 'finding any dataset in the digital universe' and 'self-improving AI systems' significantly overreach the actual scope of evaluating NLP text datasets from HuggingFace/PapersWithCode." 417 } 418 ], 419 "cited_papers": [ 420 { 421 "title": "DataFinder: Scientific dataset recommendation from natural language descriptions", 422 "authors": [ 423 "Vijay Viswanathan", 424 "Luyu Gao", 425 "Tongshuang Wu", 426 "Pengfei Liu", 427 "Graham Neubig" 428 ], 429 "year": 2023, 430 "arxiv_id": "2305.16636", 431 "relevance": "Prior work on automated dataset recommendation from natural language queries." 432 }, 433 { 434 "title": "Better synthetic data by retrieving and transforming existing datasets", 435 "authors": [ 436 "Saumya Gandhi", 437 "Ritu Gala", 438 "Vijay Viswanathan", 439 "Tongshuang Wu", 440 "Graham Neubig" 441 ], 442 "year": 2024, 443 "arxiv_id": "2404.14361", 444 "relevance": "Dataset transformation techniques for repurposing existing datasets, a precursor to synthesis-based approaches." 445 }, 446 { 447 "title": "DeepResearcher: Scaling deep research via reinforcement learning in real-world environments", 448 "authors": [ 449 "Yuxiang Zheng", 450 "Dayuan Fu", 451 "Xiangkun Hu" 452 ], 453 "year": 2025, 454 "arxiv_id": "2504.03160", 455 "relevance": "Deep research agent using RL, evaluated in this benchmark." 
456 }, 457 { 458 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 459 "authors": [ 460 "Carlos E Jimenez", 461 "John Yang", 462 "Alexander Wettig" 463 ], 464 "year": 2023, 465 "arxiv_id": "2310.06770", 466 "relevance": "Major benchmark for evaluating LLM coding agents on real-world tasks." 467 }, 468 { 469 "title": "SWE-smith: Scaling data for software engineering agents", 470 "authors": [ 471 "John Yang", 472 "Kilian Leret", 473 "Carlos E Jimenez" 474 ], 475 "year": 2025, 476 "arxiv_id": "2504.21798", 477 "relevance": "Data scaling for SE agents, directly relevant to AI-assisted software engineering research." 478 }, 479 { 480 "title": "ScienceAgentBench: Toward rigorous assessment of language agents for data-driven scientific discovery", 481 "authors": [ 482 "Ziru Chen", 483 "Shijie Chen", 484 "Yuting Ning" 485 ], 486 "year": 2024, 487 "arxiv_id": "2410.05080", 488 "relevance": "Benchmark for scientific discovery agents with rigorous evaluation methodology." 489 }, 490 { 491 "title": "Judging LLM-as-a-judge with MT-bench and Chatbot Arena", 492 "authors": [ 493 "Lianmin Zheng", 494 "Wei-Lin Chiang", 495 "Ying Sheng" 496 ], 497 "year": 2023, 498 "relevance": "Foundational work on LLM-as-judge evaluation methodology used in this paper's metadata scoring." 499 }, 500 { 501 "title": "Training language models to follow instructions with human feedback", 502 "authors": [ 503 "Long Ouyang", 504 "Jeffrey Wu", 505 "Xu Jiang" 506 ], 507 "year": 2022, 508 "relevance": "RLHF instruction tuning methodology used as basis for the fine-tuning evaluation approach." 509 }, 510 { 511 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 512 "authors": [ 513 "Daya Guo", 514 "Dejian Yang", 515 "Haowei Zhang" 516 ], 517 "year": 2025, 518 "arxiv_id": "2501.12948", 519 "relevance": "RL-based reasoning model relevant to agentic AI capabilities." 
520 }, 521 { 522 "title": "Are emergent abilities of large language models a mirage?", 523 "authors": [ 524 "Rylan Schaeffer", 525 "Brando Miranda", 526 "Sanmi Koyejo" 527 ], 528 "year": 2023, 529 "relevance": "Analysis of LLM scaling properties, cited for long-context attention limitations affecting few-shot results." 530 } 531 ], 532 "engagement_factors": { 533 "practical_relevance": { 534 "score": 1, 535 "justification": "Benchmarks dataset discovery agents but doesn't provide a usable tool — practitioners can't directly apply this to their workflows." 536 }, 537 "surprise_contrarian": { 538 "score": 1, 539 "justification": "The 22% ceiling for deep research systems is notable but 'AI struggles on hard benchmark' is a familiar narrative, not a contrarian finding." 540 }, 541 "fear_safety": { 542 "score": 0, 543 "justification": "No safety, security, or risk angle whatsoever." 544 }, 545 "drama_conflict": { 546 "score": 0, 547 "justification": "No controversy or conflict — straightforwardly evaluates systems without challenging specific company claims." 548 }, 549 "demo_ability": { 550 "score": 1, 551 "justification": "Code is public on GitHub but reproducing requires fine-tuning LLaMA-3.1-8B, multiple API keys, and significant compute." 552 }, 553 "brand_recognition": { 554 "score": 1, 555 "justification": "From Shanghai Jiao Tong University/GAIR — recognized in NLP but not a household name in broader tech circles." 556 } 557 } 558 }