scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20261B)
      1 {
      2   "paper": {
      3     "title": "Improving Reproducibility in Machine Learning Research (A Report from the NeurIPS 2019 Reproducibility Program)",
      4     "authors": ["Joelle Pineau", "Philippe Vincent-Lamarre", "Koustuv Sinha", "Vincent Larivière", "Alina Beygelzimer", "Florence d'Alché-Buc", "Emily Fox", "Hugo Larochelle"],
      5     "year": 2021,
      6     "venue": "Journal of Machine Learning Research",
      7     "arxiv_id": "2003.12206"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive is provided for the analysis scripts used to produce the figures and statistics in the paper. This is ironic given that the paper's subject is reproducibility."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The underlying data (checklist responses, reviewer scores, code submission status) from NeurIPS 2019 is not released. Only aggregated figures and statistics are shown."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment or dependency specifications are provided for the analysis."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No reproduction instructions are provided. The reader cannot recreate the analysis from the paper alone."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as percentages and counts without confidence intervals or error bars. For example, '74.4% of papers provided code' with no uncertainty quantification."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "A significance test is reported for the association between code availability and reviewer scores: 'the availability of code at submission was positively associated with the reviewer score (p < 1e-08)' (Section 3)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The p-value for code-score association is reported without effect size. No Cohen's d, odds ratio, or magnitude of the score difference is given."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Sample sizes are stated (6743 submissions, 173 papers claimed for reproduction) but no justification or power analysis is discussed. The samples are convenience samples from the conference."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance or standard deviation is reported for any of the statistics presented."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Table 1 compares code submission rates across NeurIPS 2018, ICML 2019, and NeurIPS 2019, providing temporal baselines. Table 2 compares reproducibility challenge participation across ICLR 2018, ICLR 2019, and NeurIPS 2019."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The comparisons use recent conferences (2018-2019) which are the most relevant contemporary venues."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "This is a descriptive report on a conference program, not a system with components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are reported: code submission rates, acceptance rates, checklist response distributions, reviewer usefulness ratings, reproducibility challenge participation counts."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This paper reports on a conference program; human evaluation of system outputs is not applicable."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No ML model is being trained or evaluated; this is a descriptive observational study."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Breakdowns are provided by affiliation (academia vs. industry) in Figure 2, by checklist question in Figure 4, and by reviewer usefulness assessment in Figure 6."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 6 discusses limitations including that only 34% of reviewers found the checklist useful, and acknowledges they 'do not have concluding evidence that these processes indeed have an impact on the quality of the work.'"
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that 36% of authors judged error bars not applicable to their results (a concerning finding), and that only 34% of reviewers found the checklist useful. The Discussion acknowledges many open questions about actual effectiveness."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims are descriptive ('we describe each of these components, how it was deployed, as well as what we were able to learn') and the paper delivers on this description."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper implies causal relationships (e.g., code submission policy leading to increased code availability, checklist usefulness leading to higher scores) but the study design is observational with many confounds. Section 5 notes 'it is too early to rule out potential covariates' but still presents suggestive findings without adequate causal identification."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper is explicit that findings are from NeurIPS 2019 specifically, and the Discussion carefully notes that extending to other venues would require further investigation."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 5 explicitly notes potential covariates for the checklist-acceptance association ('paper's topic, reviewer expectations, etc.') and discusses the confound of the NA-acceptance correlation disappearing when filtering for reviewers who found the checklist useful."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "No ML models are used in this study; it is a report on conference processes."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting of language models is involved."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No ML models are trained or evaluated in this study."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper does not describe how the raw CMT/OpenReview data was processed into the statistics and figures presented. No filtering criteria, data cleaning steps, or exclusion rules are documented."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 (Discussion) serves as a substantive limitations section, listing open questions and explicitly stating 'we do not have concluding evidence that these processes indeed have an impact on the quality of the work.'"
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The Discussion raises specific threats: potential covariates in the checklist-acceptance association, that increased submissions may not indicate improved quality, and that cultural/organizational change is needed beyond checklists."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The Discussion lists specific open questions the study does NOT answer: long-term value of code submitted, effect of incentives on challenge participation, accuracy of self-reported checklist answers, measurable effect on paper quality."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw data (checklist responses, reviewer scores, submission metadata) is not released. Only aggregated figures and summary statistics are provided."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Data collection is described: checklist responses were collected via the CMT platform at submission and camera-ready stages, reviewer survey responses were collected at the end of the review period, and reproducibility challenge data came from OpenReview."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The paper describes how reproducibility challenge participants were recruited: through graduate ML courses at universities, with high participation from McGill, KTH, Brown, and IIT Roorkee. For authors and reviewers, these are naturally the NeurIPS 2019 submitters and committee."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No documentation of how raw CMT/OpenReview data was transformed into the figures and statistics. Intermediary steps between data collection and final analysis are not described."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding disclosure or acknowledgment of grants is present. The Acknowledgments section thanks people and organizations for support but does not disclose funding sources."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed, including dual affiliations with industry (Facebook AI Research, Google, Yahoo! Research, Apple)."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. Several authors are affiliated with companies (Facebook, Google, Apple, Yahoo) that have stakes in ML research practices, but this potential conflict is not discussed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "No pre-trained model is evaluated on any benchmark. This is an observational study of conference processes."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No pre-trained model is evaluated on any benchmark."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No pre-trained model is evaluated on any benchmark."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The reproducibility program involved surveys of authors and reviewers (human participants), but no pre-registration is mentioned."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "The study analyzed responses from thousands of authors and reviewers but no IRB or ethics approval is mentioned."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Participant demographics are partially reported: affiliation type (academia vs. industry) in Figure 2, geographic distribution of reproducibility challenge participants in Figure 8, and institutional breakdown in Section 4."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No inclusion/exclusion criteria are stated for which submissions or reviewer responses were included in the analysis."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "This is an observational study of a conference process, not a randomized experiment. Assignment to conditions is not applicable."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "Not an experimental study with conditions requiring blinding."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": true,
    267         "justification": "Table 2 reports that 173 papers were claimed for reproduction and 84 reports were reviewed, showing attrition. Figure 3(b) shows the proportion of self-reported code submissions confirmed by reviewers."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This is a report on conference processes, not a method with inference costs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "This is a report on conference processes, not a computationally intensive method."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Code submission rates increased from less than 50% (NeurIPS 2018) to 74.4% at camera-ready (NeurIPS 2019) with a voluntary code submission policy.",
    286       "evidence": "Table 1 shows NeurIPS 2018 had <50% of papers with code at camera-ready, while NeurIPS 2019 had 40% at submission and 74.4% at camera-ready. Section 3.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "Code availability at submission was positively associated with higher reviewer scores (p < 1e-08).",
    291       "evidence": "Section 3 states this p-value directly, though no effect size is provided.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "One-third of reviewers found the ML reproducibility checklist useful for evaluating submissions.",
    296       "evidence": "Section 5 reports that 34% of reviewers responded 'Yes' when asked if checklist answers were useful.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Participation in the reproducibility challenge increased 92% from ICLR 2019 to NeurIPS 2019.",
    301       "evidence": "Table 2 shows 90 papers claimed at ICLR 2019 vs. 173 at NeurIPS 2019. Section 4.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Reviewers who found the checklist useful gave higher scores to papers.",
    306       "evidence": "Figure 6(a) shows this relationship. However, the direction of causality is unclear — it could be that better papers had better checklist responses.",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": ["observational", "case-study"],
    311   "key_findings": "The NeurIPS 2019 reproducibility program deployed three mechanisms: a voluntary code submission policy (achieving 74.4% compliance at camera-ready), a reproducibility challenge (173 papers claimed, 84 reports reviewed), and an ML reproducibility checklist. Code availability was associated with higher reviewer scores (p < 1e-08), and 34% of reviewers found the checklist useful. The paper honestly acknowledges it cannot establish that these mechanisms improved actual research quality, and identifies several open questions for future investigation.",
    312   "red_flags": [
    313     {
    314       "flag": "No raw data released",
    315       "detail": "A paper about reproducibility does not release its own underlying data (checklist responses, reviewer scores, submission metadata), making its own results not independently verifiable."
    316     },
    317     {
    318       "flag": "No analysis code released",
    319       "detail": "The analysis scripts used to generate figures and statistics are not provided, despite the paper advocating for code release."
    320     },
    321     {
    322       "flag": "Causal language with observational data",
    323       "detail": "Associations between code submission/checklist use and acceptance rates are presented suggestively, though the paper does note covariates cannot be ruled out."
    324     },
    325     {
    326       "flag": "Self-reported data without validation",
    327       "detail": "Checklist responses are self-reported by authors. Figure 3(b) shows reviewer confirmation rates but notes discrepancies, and the paper itself asks 'What is the accuracy of the ML checklist answers?' as an open question."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Deep reinforcement learning that matters",
    333       "authors": ["Peter Henderson", "Riashat Islam", "Philip Bachman", "Joelle Pineau", "Doina Precup", "David Meger"],
    334       "year": 2018,
    335       "relevance": "Foundational paper on reproducibility failures in deep RL, motivating the need for reproducibility standards."
    336     },
    337     {
    338       "title": "A step toward quantifying independently reproducible machine learning research",
    339       "authors": ["Edward Raff"],
    340       "year": 2019,
    341       "relevance": "Empirical study quantifying ML reproducibility rates, directly relevant to reproducibility methodology assessment."
    342     },
    343     {
    344       "title": "Unreproducible research is reproducible",
    345       "authors": ["Xavier Bouthillier", "César Laurent", "Pascal Vincent"],
    346       "year": 2019,
    347       "relevance": "Studies variance in ML results due to implementation choices, relevant to understanding reproducibility challenges."
    348     },
    349     {
    350       "title": "Are GANs created equal? A large-scale study",
    351       "authors": ["Mario Lucic", "Karol Kurach", "Marcin Michalski", "Sylvain Gelly", "Olivier Bousquet"],
    352       "year": 2018,
    353       "relevance": "Large-scale benchmark evaluation study examining whether reported GAN improvements are real, relevant to evaluation methodology."
    354     },
    355     {
    356       "title": "State of the art: Reproducibility in artificial intelligence",
    357       "authors": ["Odd Erik Gundersen", "Sigbjørn Kjensmo"],
    358       "year": 2018,
    359       "relevance": "Survey of reproducibility practices in AI research, directly relevant to methodology quality assessment."
    360     },
    361     {
    362       "title": "On reproducible AI: Towards reproducible research, open science and digital scholarship in AI publications",
    363       "authors": ["Odd Erik Gundersen", "Yolanda Gil", "David W Aha"],
    364       "year": 2018,
    365       "relevance": "Develops a checklist for AI publications covering data, code, methods, and experiments — directly comparable to this survey's scope."
    366     },
    367     {
    368       "title": "A manifesto for reproducible science",
    369       "authors": ["Marcus R Munafò", "Brian A Nosek"],
    370       "year": 2017,
    371       "relevance": "Influential position paper on reproducibility across sciences, provides framework for understanding methodological quality."
    372     },
    373     {
    374       "title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
    375       "authors": ["Jacob Devlin", "Ming-Wei Chang", "Kenton Lee", "Kristina Toutanova"],
    376       "year": 2018,
    377       "arxiv_id": "1810.04805",
    378       "relevance": "Referenced as an example of large-scale ML work that was successfully reproduced despite high compute requirements."
    379     }
    380   ]
    381 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs