scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19534B)
      1 {
      2   "scan_version": 3,
      3   "active_modules": [],
      4   "paper": {
      5     "title": "DataDreamer: A Tool for Synthetic Data Generation and Reproducible LLM Workflows",
      6     "authors": [
      7       "Ajay Patel",
      8       "Colin Raffel",
      9       "Chris Callison-Burch"
     10     ],
     11     "year": 2024,
     12     "venue": "ACL 2024 Theme Track",
     13     "arxiv_id": "2402.10379"
     14   },
     15   "methodology_tags": [
     16     "case-study"
     17   ],
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "GitHub repository provided: https://github.com/datadreamer-dev/DataDreamer. Installation via pip install datadreamer.dev is documented."
     24       },
     25       "data_released": {
     26         "applies": false,
     27         "answer": false,
     28         "justification": "This is a tool paper with no dataset produced by the research itself. The tool helps others produce datasets."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions pip install datadreamer.dev but provides no requirements.txt, Python version requirements, or dependency specifications for the library itself."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Multiple complete code examples are provided (Examples 1-5) showing how to install and use the tool, with specific model names, hyperparameters, and configurations."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "No empirical experiments with quantitative results are reported. This is a tool/system paper."
     46       },
     47       "significance_tests": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "No comparative empirical claims are made. The paper describes a tool, not experimental results."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "No empirical experiments with quantitative results are reported."
     56       },
     57       "sample_size_justified": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "No empirical experiments conducted."
     61       },
     62       "variance_reported": {
     63         "applies": false,
     64         "answer": false,
     65         "justification": "No empirical experiments conducted."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table 1 compares DataDreamer's feature coverage against LangChain, Axolotl, and HF Transformers + TRL. However, this is a feature checklist comparison, not an empirical benchmark."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "LangChain, Axolotl, and HF Transformers + TRL were all contemporary and widely-used tools at time of publication."
     78       },
     79       "ablation_study": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "This is a tool paper, not a system with components to ablate."
     83       },
     84       "multiple_metrics": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No empirical evaluation is conducted. Feature comparison is qualitative."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No user study or human evaluation of the tool's usability, learnability, or effectiveness is reported."
     93       },
     94       "held_out_test_set": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "No empirical experiments with train/test splits are conducted as part of the paper's evaluation."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 1 provides per-feature breakdown across categories (Implementation, Integrations, Tasks, Conveniences, Open Science and Reproducibility) for each compared tool."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "No failure cases or limitations of the tool's functionality are discussed beyond the high-level limitations section about closed-source models."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "No negative results or unsuccessful design decisions are discussed."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims DataDreamer is an open source library for LLM workflows with reproducibility features. The paper demonstrates these features through code examples and feature descriptions. No unsupported quantitative claims."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
     124         "justification": "The paper claims that DataDreamer's 'adoption can help advance the rate of research progress in workflows involving LLMs by making implementation easier and making research output reproducible.' This is a causal claim made without evidence of actual adoption impact."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper is appropriately scoped to NLP/LLM workflows and does not overclaim applicability beyond its supported models and tasks."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": false,
    133         "answer": false,
    134         "justification": "No empirical results are presented that would require alternative explanations."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": false,
    138         "answer": false,
    139         "justification": "This is a tool paper that presents DataDreamer as a software library. It makes no empirical claims about outcomes based on measurements. The paper describes functionality and best practices rather than measuring and claiming an outcome. No proxy-outcome gap exists."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The demonstration uses 'gpt-4' (Example 1 line 9: model_name=\"gpt-4\") without a version or snapshot date. Other examples use specific model names like 'TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T' and 'google/t5-v1_1-base' which are versioned HF model IDs."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Full prompt text is provided in the code examples, e.g., 'Generate an arXiv abstract of an NLP research paper. Return just the abstract, no titles.' in Example 1."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Code examples include hyperparameters: temperature=1.2, top_p=1.0, epochs=30, batch_size=8, gradient_accumulation_steps=32, dtype='bfloat16', LoRA config, etc."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. DataDreamer is a workflow orchestration library, not an agent system."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No experimental data is collected or preprocessed. Code examples are demonstrations, not experiments."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "A dedicated 'Limitations' section discusses challenges with closed-source models and API reproducibility."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The limitations section discusses only the general issue of closed-source model reproducibility. No specific threats to the paper's claims or design decisions are discussed."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The paper does not explicitly state what DataDreamer does NOT cover or what workflow types are out of scope. It acknowledges closed-source model limitations but does not bound its claims about research impact."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": false,
    189         "answer": false,
    190         "justification": "No experimental data is collected. This is a tool paper."
    191       },
    192       "data_collection_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No data collection is performed."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No participants or samples are recruited."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No experimental data pipeline exists."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Acknowledgements section states: 'This research is supported in part by the Office of the Director of National Intelligence (ODNI), Intelligence Advanced Research Projects Activity (IARPA), via the HIATUS Program contract #2022-22072200005.'"
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are listed: University of Pennsylvania and University of Toronto / Vector Institute."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "ODNI/IARPA is a government research funder with no commercial stake in the DataDreamer tool itself."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. It demonstrates a tool."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No benchmark evaluation is conducted."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No benchmark evaluation is conducted."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "This is a tool paper, not an empirical method paper. The tool itself helps others track costs but no method costs are reported."
    289       },
    290       "compute_budget_stated": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "No experiments requiring significant compute are conducted by this paper."
    294       }
    295     }
    296   },
    297   "claims": [
    298     {
    299       "claim": "DataDreamer provides a single library with standardized interface for LLM workflows including synthetic data generation, fine-tuning, instruction-tuning, and alignment.",
    300       "evidence": "Table 1 compares feature coverage across LangChain, Axolotl, HF Transformers+TRL, and DataDreamer. Table 2 lists built-in steps, models, and trainers. Code examples 1-5 demonstrate these workflows.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "DataDreamer automatically produces reproducibility fingerprints, synthetic data cards, and model cards that aid open science.",
    305       "evidence": "Table 3 defines information recorded in synthetic data/model cards including date/time, dataset/model names, licenses, citations, reproducibility fingerprints, and environment information. Section 5 describes reproducibility practices.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "DataDreamer's adoption can help advance the rate of research progress in workflows involving LLMs.",
    310       "evidence": "No empirical evidence of adoption impact. This is a forward-looking claim in Section 6 (Conclusion).",
    311       "supported": "unsupported"
    312     }
    313   ],
    314   "key_findings": "DataDreamer is an open-source Python library that unifies LLM workflow tasks (synthetic data generation, fine-tuning, alignment, distillation) under a standardized interface with automatic reproducibility features including caching, reproducibility fingerprints, and synthetic data/model cards. The paper compares feature coverage against LangChain, Axolotl, and HF Transformers+TRL, showing DataDreamer covers the broadest range of capabilities in a single library. No empirical evaluation of tool effectiveness or user productivity is provided.",
    315   "red_flags": [
    316     {
    317       "flag": "No empirical evaluation",
    318       "detail": "The paper introduces a tool but provides no empirical evaluation — no user study, no benchmark of tool effectiveness, no comparison of researcher productivity or code complexity. All claims about utility are based on feature descriptions and code examples."
    319     },
    320     {
    321       "flag": "Feature comparison is self-reported",
    322       "detail": "Table 1 comparing DataDreamer against alternatives is a self-reported feature checklist created by the tool's authors. No independent verification or empirical testing of these feature claims is provided."
    323     }
    324   ],
    325   "cited_papers": [
    326     {
    327       "title": "Quantifying language models' sensitivity to spurious features in prompt design",
    328       "authors": [
    329         "Melanie Sclar",
    330         "Yejin Choi",
    331         "Yulia Tsvetkov",
    332         "Alane Suhr"
    333       ],
    334       "year": 2023,
    335       "arxiv_id": "2310.11324",
    336       "relevance": "Demonstrates prompt sensitivity problem that motivates DataDreamer's reproducibility features for LLM workflows."
    337     },
    338     {
    339       "title": "The curse of recursion: Training on generated data makes models forget",
    340       "authors": [
    341         "Ilia Shumailov",
    342         "Zakhar Shumaylov",
    343         "Yiren Zhao",
    344         "Yarin Gal",
    345         "Nicolas Papernot",
    346         "Ross Anderson"
    347       ],
    348       "year": 2023,
     349       "relevance": "Motivates DataDreamer's synthetic data card feature to prevent model collapse from training on synthetic data that is not labeled as synthetic."
    350     },
    351     {
    352       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    353       "authors": [
    354         "Lianmin Zheng"
    355       ],
    356       "year": 2023,
    357       "arxiv_id": "2306.05685",
    358       "relevance": "LLM-as-judge evaluation paradigm that DataDreamer supports as a workflow type."
    359     },
    360     {
    361       "title": "Direct preference optimization: Your language model is secretly a reward model",
    362       "authors": [
    363         "Rafael Rafailov",
    364         "Archit Sharma",
    365         "Eric Mitchell"
    366       ],
    367       "year": 2023,
    368       "arxiv_id": "2305.18290",
    369       "relevance": "DPO alignment technique integrated into DataDreamer's trainer system."
    370     },
    371     {
    372       "title": "Self-rewarding language models",
    373       "authors": [
    374         "Weizhe Yuan",
    375         "Richard Yuanzhe Pang",
    376         "Kyunghyun Cho"
    377       ],
    378       "year": 2024,
    379       "arxiv_id": "2401.10020",
    380       "relevance": "Self-improving LLM workflow that DataDreamer supports through multi-round chaining of generation and training."
    381     },
    382     {
    383       "title": "LoRA: Low-rank adaptation of large language models",
    384       "authors": [
    385         "Edward J Hu"
    386       ],
    387       "year": 2021,
    388       "arxiv_id": "2106.09685",
    389       "relevance": "Parameter-efficient fine-tuning technique integrated into DataDreamer's training system."
    390     },
    391     {
    392       "title": "Distilling step-by-step! Outperforming larger language models with less training data and smaller model sizes",
    393       "authors": [
    394         "Cheng-Yu Hsieh"
    395       ],
    396       "year": 2023,
    397       "arxiv_id": "2305.02301",
    398       "relevance": "Distillation workflow that DataDreamer supports as a core use case."
    399     },
    400     {
    401       "title": "Training language models to follow instructions with human feedback",
    402       "authors": [
    403         "Long Ouyang",
    404         "Jeffrey Wu"
    405       ],
    406       "year": 2022,
    407       "relevance": "InstructGPT/RLHF approach that DataDreamer's alignment trainers implement."
    408     },
    409     {
    410       "title": "AlpacaFarm: A simulation framework for methods that learn from human feedback",
    411       "authors": [
    412         "Yann Dubois"
    413       ],
    414       "year": 2023,
    415       "arxiv_id": "2305.14387",
    416       "relevance": "Related framework for LLM feedback simulation, relevant to evaluating LLM workflow tools."
    417     },
    418     {
    419       "title": "Compressing LLMs: The truth is rarely pure and never simple",
    420       "authors": [
    421         "Ajay Jaiswal"
    422       ],
    423       "year": 2023,
    424       "arxiv_id": "2310.01382",
    425       "relevance": "Shows quantization affects model outputs, motivating DataDreamer's reproducibility fingerprints that capture optimization configurations."
    426     }
    427   ],
    428   "engagement_factors": {
    429     "practical_relevance": {
    430       "score": 3,
    431       "justification": "Pip-installable Python library with clear API for synthetic data generation, fine-tuning, and alignment workflows that practitioners can integrate immediately."
    432     },
    433     "surprise_contrarian": {
    434       "score": 0,
    435       "justification": "Confirms the known need for better LLM workflow tooling and reproducibility without challenging any conventional wisdom."
    436     },
    437     "fear_safety": {
    438       "score": 0,
    439       "justification": "No safety or risk angle; focuses on reproducibility and workflow convenience."
    440     },
    441     "drama_conflict": {
    442       "score": 0,
    443       "justification": "No controversy or conflict; the feature comparison table is mild and self-reported rather than adversarial."
    444     },
    445     "demo_ability": {
    446       "score": 3,
    447       "justification": "Open-source pip-installable tool with extensive code examples that users can try immediately with a single pip install command."
    448     },
    449     "brand_recognition": {
    450       "score": 1,
    451       "justification": "University of Pennsylvania and Colin Raffel are recognized in NLP but not household names in the broader tech community."
    452     }
    453   }
    454 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs