scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21451B)
      1 {
      2   "paper": {
      3     "title": "Show Your Work: Improved Reporting of Experimental Results",
      4     "authors": ["Jesse Dodge", "Suchin Gururangan", "Dallas Card", "Roy Schwartz", "Noah A. Smith"],
      5     "year": 2019,
      6     "venue": "EMNLP",
      7     "arxiv_id": "1909.03004"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "Code released at https://github.com/allenai/allentune (footnote 1) and https://github.com/allenai/show-your-work (Section 4.2, footnote 11)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available datasets (SST, SciTail, SQuAD). Hyperparameter search results are implicitly available through the code repository."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Computing infrastructure is specified per experiment in the appendix tables (e.g., 'GeForce GTX 1080 GPU', '3.1 GHz Intel Core i7 CPU', 'NVIDIA Titan Xp GPU'), and the software platforms (AllenNLP, scikit-learn) are named. However, no requirements.txt or detailed library versions are provided, so this is borderline."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Appendix tables B-D provide complete hyperparameter search spaces, best assignments, number of trials, search strategy, and computing infrastructure for every experiment. Code is linked. This is sufficient for reproduction."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Standard deviation is reported as shaded regions in figures (e.g., Fig. 4: 'shaded area represents the expected performance ±1 standard deviation'). Section 3.3 discusses variance estimation."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims about model rankings changing with budget but does not use statistical significance tests for these comparisons. Rankings are compared visually from expected validation performance curves."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports concrete performance differences with context, e.g., models require budgets ranging from 2 to 20 hyperparameter trials to match reported performance (Section 4.4), and 18 GPU days for BiDAF on SQuAD (Section 4.4). The EMNLP survey reports specific percentages (e.g., '74% reported best hyperparameter assignments', '10% or fewer reported search bounds')."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The EMNLP 2018 survey uses 50 randomly sampled papers (Section 5) but does not justify why 50 was chosen or discuss statistical power for the prevalence estimates."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 3.3 is dedicated to computing variance of the expected maximum. Fig. 4 shows ±1 standard deviation. The paper explicitly advocates for reporting variance (Section 3.3: 'Expected performance becomes more useful with an estimate of variation')."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares its expected maximum technique against the bootstrap method (Section 3.2) and against current practice of reporting single best test scores."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The bootstrap is the standard non-parametric method for estimating statistics without closed form. The paper also compares against models from contemporary work (Peters et al. 2019, Khot et al. 2018)."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "The proposed technique (expected validation performance) is a single mathematical method with no separable components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses accuracy on SST (both fine-grained and binary), accuracy on SciTail, and exact match on SQuAD across different experiments."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "The paper proposes a reporting methodology based on mathematical computation of expected validation performance. Human evaluation of outputs is not relevant to these claims."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper explicitly distinguishes validation and test performance. The expected maximum is computed on validation data, and test results are reported separately (Section 4, Fig. 1)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per model family (LR vs CNN, DGEM vs DAM vs ESIM vs n-gram, BCN with different embeddings) and per dataset (SST, SciTail, SQuAD). The EMNLP survey provides per-item breakdown in Table 1."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 5 discusses limitations of the approach, including that fixing the same number of hyperparameter trials does not imply fair comparison, and that manual tuning history cannot be captured. Section 4.4 notes that hyperparameter search space choices affect results."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper shows cases where model rankings reverse with different budgets (Fig. 1, Fig. 3), which is itself a negative result about current reporting practices. Section 5 acknowledges 'there may be no simple way to make a comparison fair.'"
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract makes three claims: (1) test-set scores alone are insufficient (supported by Figs. 1, 3 showing ranking reversals), (2) conclusions in multiple cases change with budget (Sections 4.2-4.4), and (3) the compute required varies massively (Section 4.4, 'from hours to weeks'). All are supported."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The main causal claim is that varying computational budget causes changes in relative model rankings. This is demonstrated through controlled experiments where the same models are evaluated under systematically varied budgets, which is adequate causal design."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper focuses on NLP and frames its contributions accordingly. Section 5 discusses limitations of the approach (e.g., manual tuning cannot be captured, search space choices matter). Claims are bounded to the tested settings."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 5 discusses alternative explanations: different hyperparameter spaces could change results, prior human experience affects search bounds for popular models, implementation differences affect runtime comparisons. Footnote 13 acknowledges 'different choices could result in better or worse expected performance.'"
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Specific model implementations are cited (AllenNLP BCN, scikit-learn logistic regression, implementations from Khot et al. 2018, Seo et al. 2017). For NLP models in 2019, these are well-defined architectures, not API-based models with versioning issues."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "The paper does not use prompting. It trains traditional NLP models (CNNs, LSTMs, logistic regression) with hyperparameter search."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix tables B-D provide exhaustive hyperparameter search spaces and best assignments for every experiment. This is exemplary reporting — the paper practices what it preaches."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The paper trains traditional NLP models."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Standard datasets are used (SST, SciTail, SQuAD) with standard splits. For the EMNLP survey, the paper describes sampling 50 random papers. Preprocessing for the models follows the cited implementations."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5 ('Recommendations') contains substantive discussion of limitations, including the difficulty of fair comparisons when hyperparameter spaces differ, the inability to account for manual tuning history, and implementation differences."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 5 discusses specific threats: 'the two models in Fig. 1 have hyperparameter spaces that are different, so fixing the same number of hyperparameter trials for both models does not imply a fair comparison.' Footnote 13 acknowledges search space choices affect results."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5 states 'there may be no simple way to make a comparison fair' and discusses specific things the approach does NOT address: manual tuning history, implementation-dependent runtime differences, and adaptive search strategies."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Code is released at github.com/allenai/allentune and github.com/allenai/show-your-work, which would contain the raw hyperparameter search results. Standard public datasets are used."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "For experiments: uniform random hyperparameter sampling with specified search spaces and number of trials (50-128). For the EMNLP survey: '50 random EMNLP 2018 papers that include experimental results' (Section 5)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The EMNLP paper survey sampled papers, not people, and the selection method (random from EMNLP 2018) is described in the paper."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from hyperparameter sampling to expected maximum computation is mathematically specified in Section 3. The EMNLP survey pipeline is straightforward (random sample, evaluate checklist items)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgments state 'This work was completed while the first author was an intern at the Allen Institute for Artificial Intelligence.' Authors are affiliated with AI2, CMU, and UW."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All author affiliations are listed: CMU, Allen Institute for AI, UW. The paper does not evaluate any product from these organizations, so no conflict exists."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "AI2 and university affiliations have no financial stake in whether reporting practices improve. The paper advocates for methodological reform, not a commercial product."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper trains traditional NLP models from scratch (CNNs, LSTMs, logistic regression). It does not evaluate pre-trained model capabilities on benchmarks in a way where training data contamination is relevant."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable: models are trained from scratch on standard splits, and no pre-trained model is evaluated on a benchmark, so overlap between pre-training data and test data is not relevant."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
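    230         "justification": "Not applicable: the paper trains models from scratch rather than evaluating pre-trained model capabilities on benchmarks, so benchmark contamination is not relevant to this paper type."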
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Training durations are reported per experiment in appendix tables (e.g., 39 sec for CNN on SST, 31617 sec for BiDAF on SQuAD). The paper's central argument is about computational cost of hyperparameter search."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Number of search trials (50-128), GPU hardware, and per-trial training durations are all reported in the appendix. Total compute is derivable (e.g., 128 trials × ~8.8 hours = ~47 GPU days for SQuAD BiDAF)."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Test-set performance scores alone are insufficient for drawing accurate conclusions about which model performs best.",
    286       "evidence": "Figures 1 and 3 show that relative model rankings change depending on computational budget (number of hyperparameter trials). On SST, LR outperforms CNN with fewer than 10 trials but CNN is better with larger budgets.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Multiple recent model comparisons would have reached different conclusions with more or less computation.",
    291       "evidence": "Section 4.4 shows SciTail model rankings vary with budget (Fig. 3), and Section 4.3 shows ELMo embedding approaches have different budget-performance tradeoffs (Fig. 2).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "The amount of computation required to obtain reported results varies massively across papers, from hours to weeks.",
    296       "evidence": "Section 4.4: BiDAF on SQuAD requires ~18 GPU days (55 trials) to match reported performance in expectation (Fig. 4), while simpler models on SciTail require only 2 trials.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "None of 50 sampled EMNLP 2018 papers reported all checklist items; 10% or fewer reported hyperparameter search bounds, number of trials, or score distributions.",
    301       "evidence": "Table 1 in Appendix A provides the full breakdown across 50 randomly sampled EMNLP 2018 papers.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "The expected maximum method has strictly less error than the bootstrap for estimating expected best validation performance.",
    306       "evidence": "Section 3.2 argues the closed-form solution eliminates resampling error while retaining the same finite sample error. This is a mathematical argument, not an empirical comparison.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval", "theoretical"],
    311   "key_findings": "The paper demonstrates that reporting only test-set performance is insufficient because relative model rankings change depending on computational budget for hyperparameter search. Using a closed-form method for computing expected validation performance as a function of budget, the authors show that on SciTail and SST, the best model depends on how many hyperparameter configurations are tried. A survey of 50 EMNLP 2018 papers found that 10% or fewer reported hyperparameter search bounds, number of trials, or score distributions. The paper proposes a reporting checklist and releases the allentune tool for computing expected validation performance curves.",
    312   "red_flags": [],
    313   "cited_papers": [
    314     {
    315       "title": "Troubling Trends in Machine Learning Scholarship",
    316       "authors": ["Zachary C. Lipton", "Jacob Steinhardt"],
    317       "year": 2018,
    318       "relevance": "Directly addresses methodological problems in ML research including incorrectly attributing empirical gains to modeling choices."
    319     },
    320     {
    321       "title": "Green AI",
    322       "authors": ["Roy Schwartz", "Jesse Dodge", "Noah A. Smith", "Oren Etzioni"],
    323       "year": 2019,
    324       "relevance": "Addresses computational cost and efficiency in AI research, directly relevant to cost and practicality assessment."
    325     },
    326     {
    327       "title": "Winner's Curse? On Pace, Progress, and Empirical Rigor",
    328       "authors": ["D. Sculley", "Jasper Snoek", "Ali Rahimi", "Alex Wiltschko"],
    329       "year": 2018,
    330       "relevance": "Calls for stronger standards for empirical evaluation in ML, directly relevant to methodology quality assessment."
    331     },
    332     {
    333       "title": "Reporting Score Distributions Makes a Difference: Performance Study of LSTM-Networks for Sequence Tagging",
    334       "authors": ["Nils Reimers", "Iryna Gurevych"],
    335       "year": 2017,
    336       "relevance": "Demonstrates that reporting score distributions instead of single scores changes conclusions about model comparisons."
    337     },
    338     {
    339       "title": "Deep Reinforcement Learning That Matters",
    340       "authors": ["Peter Henderson", "Riashat Islam", "Philip Bachman", "Joelle Pineau", "Doina Precup", "David Meger"],
    341       "year": 2018,
    342       "relevance": "Addresses reproducibility and reporting issues in deep RL, a parallel concern to the NLP reporting issues in this paper."
    343     },
    344     {
    345       "title": "Are GANs Created Equal? A Large-Scale Study",
    346       "authors": ["Mario Lucic", "Karol Kurach", "Marcin Michalski", "Olivier Bousquet", "Sylvain Gelly"],
    347       "year": 2018,
    348       "relevance": "Found competing GAN methods to be comparable rather than clearly superior, exposing the need for better reporting standards."
    349     },
    350     {
    351       "title": "Datasheets for Datasets",
    352       "authors": ["Timnit Gebru", "Jamie H. Morgenstern", "Briana Vecchione", "Jennifer Wortman Vaughan", "Hanna M. Wallach", "Hal Daumé", "Kate Crawford"],
    353       "year": 2018,
    354       "relevance": "Proposes structured documentation for datasets, parallel to this paper's proposal for structured reporting of experimental results."
    355     },
    356     {
    357       "title": "Model Cards for Model Reporting",
    358       "authors": ["Margaret Mitchell", "Simone Wu", "Andrew Zaldivar", "Parker Barnes", "Lucy Vasserman", "Ben Hutchinson", "Elena Spitzer", "Inioluwa Deborah Raji", "Timnit Gebru"],
    359       "year": 2019,
    360       "relevance": "Proposes structured documentation for trained models, complementary to experimental results reporting."
    361     },
    362     {
    363       "title": "On the State of the Art of Evaluation in Neural Language Models",
    364       "authors": ["Gábor Melis", "Chris Dyer", "Phil Blunsom"],
    365       "year": 2018,
    366       "relevance": "Found that improvements attributed to novel architectures could be obtained through better hyperparameter tuning of existing models."
    367     },
    368     {
    369       "title": "Random Search and Reproducibility for Neural Architecture Search",
    370       "authors": ["Liam Li", "Ameet Talwalkar"],
    371       "year": 2019,
    372       "relevance": "Addresses reproducibility in neural architecture search, directly relevant to hyperparameter search methodology."
    373     }
    374   ]
    375 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs