scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (18132B)
      1 {
      2   "paper": {
      3     "title": "Reproducibility in Machine Learning-based Research: Overview, Barriers and Drivers",
      4     "authors": ["Harald Semmelrock", "Tony Ross-Hellauer", "Simone Kopeinik", "Dieter Theiler", "Armin Haberl", "Stefan Thalmann", "Dominik Kowald"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2406.14325"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No code repository or analysis scripts are mentioned or linked in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset or corpus of reviewed literature is released. The paper does not provide a structured dataset of the barriers/drivers it identifies."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a narrative review/overview paper with no computational experiments requiring environment specifications."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No instructions are provided for reproducing the literature review process (e.g., search queries, databases searched, inclusion criteria)."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "Narrative review paper with no quantitative experiments or statistical analysis."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No statistical comparisons are made in this review paper."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No quantitative experiments; this is a narrative overview."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No quantitative experiments or sampling in this review paper."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No experiments with repeated runs in this review paper."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares its contribution against prior surveys and frameworks, citing Semmelrock et al. (2023) as an earlier version and Gundersen et al.'s reproducibility taxonomy as a baseline framework."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "References include works from 2023-2024 such as Biderman et al. (2024), Kapoor & Narayanan (2023), and Koenigstorfer et al. (2024), showing engagement with recent literature."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "Not applicable to a survey/overview paper — there is no system with components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "Not applicable — no experiments with metrics in this review paper."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Not applicable — the paper makes no claims about system outputs that would require human evaluation."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Not applicable — no experimental evaluation in this review paper."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper provides per-category breakdowns of barriers and drivers across four reproducibility types (R1 Description, R2 Code, R3 Data, R4 Experiment), summarized in Table 1 and the Drivers-Barriers-Matrix."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses limitations of each driver (Table 1 lists 'Potential Limitations' for each driver) and discusses where solutions fall short."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that many reproducibility initiatives have had limited uptake, that cultural barriers persist despite technical solutions, and that incentive structures remain problematic."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims the paper identifies key barriers and drivers and maps them in a matrix. The paper body delivers this through Sections 3-5 and Table 1."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper does not make causal claims — it describes barriers and proposes drivers as potential solutions, using appropriately hedged language."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper scopes its discussion to ML-based research specifically, and notes the focus on computer science and biomedical research in the conclusion. The title accurately reflects the scope."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": false,
    124         "answer": false,
    125         "justification": "This is a narrative review/taxonomy paper presenting no empirical results, so alternative explanations are not applicable."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "No models are used or evaluated in this review paper."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting is used in this review paper."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No experiments with hyperparameters in this review paper."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used in this review paper."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper does not describe a systematic literature search methodology — no databases searched, search queries, inclusion/exclusion criteria, or filtering pipeline are documented. It appears to be a narrative rather than systematic review."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section is present. The conclusion mentions some caveats but does not constitute a substantive limitations discussion."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed. The paper does not address potential biases in its own literature selection or coverage gaps."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The conclusion explicitly scopes the work to 'computer science and biomedical research' and notes 'in such a dynamic and fast-paced research area, this discussion opens up a series of further questions.' The paper also identifies AutoML as an area not fully covered."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data (e.g., list of reviewed papers, search results) is provided for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The paper does not describe how the reviewed literature was collected — no search strategy, databases, or time period is specified."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; this is a narrative review paper. Literature selection is covered under data_collection_described."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No pipeline from literature search to final analysis is documented. The paper presents findings without showing how the reviewed works were identified and selected."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgements section states: 'This research is supported by the Horizon Europe project TIER2 (GA: 101094817), and the FFG COMET program.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Know Center Research GmbH, Graz University of Technology, and University of Graz. No conflict with evaluated products since the paper is a review."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funded by Horizon Europe (EU research program) and FFG COMET (Austrian research funding). Neither funder has a stake in the paper's conclusions about reproducibility barriers."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "No pre-trained model is evaluated on any benchmark in this review paper."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No pre-trained model is evaluated on any benchmark in this review paper."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No pre-trained model is evaluated on any benchmark in this review paper."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this review paper."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this review paper."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this review paper."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this review paper."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this review paper."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this review paper."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this review paper."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "Survey paper with no method that incurs inference cost."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "Survey paper with no computational experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "ML reproducibility is hindered by barriers across four levels: description, code, data, and experiment.",
    286       "evidence": "Sections 3-5 provide a structured analysis of barriers at each level, building on Gundersen et al.'s taxonomy. Table 1 maps drivers to barriers.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "Cultural factors and misaligned incentives are a root cause of poor reproducibility in ML research.",
    291       "evidence": "Section 6 (Conclusion) argues 'the current incentives for conducting reproducible research are limited, and open research is often regarded as an unrewarded additional effort,' citing refs [22,12].",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Technical drivers (containers, version control, standardized benchmarks) can address many but not all reproducibility barriers.",
    296       "evidence": "Table 1 shows strengths and limitations of each driver. The Drivers-Barriers-Matrix maps which drivers address which barriers, showing coverage gaps.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "AutoML tools used by domain experts pose a new barrier to reproducibility that requires further research.",
    301       "evidence": "Section 6 discusses this as a future direction, citing Haberl & Thalmann (2025) [48], but provides no empirical evidence.",
    302       "supported": "weak"
    303     }
    304   ],
    305   "methodology_tags": ["meta-analysis", "qualitative"],
    306   "key_findings": "This paper provides a narrative overview of reproducibility barriers and drivers in ML-based research, organized across four reproducibility levels (description, code, data, experiment). The authors synthesize barriers and drivers into a Drivers-Barriers-Matrix showing which technical and procedural solutions address which problems. They argue that cultural change and better education are fundamental prerequisites, as technical solutions alone are insufficient. The paper identifies AutoML tools used by domain experts as an emerging reproducibility challenge.",
    307   "red_flags": [
    308     {
    309       "flag": "No systematic review methodology",
    310       "detail": "The paper presents itself as an overview but provides no description of literature search strategy, databases used, inclusion/exclusion criteria, or how papers were selected. This makes the coverage impossible to assess or reproduce — ironic for a paper about reproducibility."
    311     },
    312     {
    313       "flag": "No quality assessment of reviewed literature",
    314       "detail": "The paper summarizes and synthesizes prior work without any structured quality assessment of the papers it reviews. Barriers and drivers are presented as established facts without evaluating the strength of evidence behind each claim."
    315     }
    316   ],
    317   "cited_papers": [
    318     {
    319       "title": "Lessons from the trenches on reproducible evaluation of language models",
    320       "authors": ["S. Biderman", "H. Schoelkopf", "L. Sutawika"],
    321       "year": 2024,
    322       "arxiv_id": "2405.14782",
    323       "relevance": "Directly addresses reproducibility challenges specific to LLM evaluation, relevant to methodological quality in AI research."
    324     },
    325     {
    326       "title": "Leakage and the reproducibility crisis in machine-learning-based science",
    327       "authors": ["S. Kapoor", "A. Narayanan"],
    328       "year": 2023,
    329       "relevance": "Identifies data leakage as a systematic reproducibility threat in ML-based science, directly relevant to evaluation methodology."
    330     },
    331     {
    332       "title": "Holistic evaluation of language models",
    333       "authors": ["P. Liang", "R. Bommasani"],
    334       "year": 2022,
    335       "arxiv_id": "2211.09110",
    336       "relevance": "HELM benchmark for LLM evaluation, relevant to standardized evaluation methodology."
    337     },
    338     {
    339       "title": "Improving reproducibility in machine learning research (a report from the NeurIPS 2019 reproducibility program)",
    340       "authors": ["J. Pineau", "P. Vincent-Lamarre", "K. Sinha"],
    341       "year": 2021,
    342       "relevance": "Documents NeurIPS reproducibility checklist initiative, directly relevant to methodological standards in ML research."
    343     },
    344     {
    345       "title": "A Step Toward Quantifying Independently Reproducible Machine Learning Research",
    346       "authors": ["E. Raff"],
    347       "year": 2019,
    348       "relevance": "Empirical study of ML paper reproducibility rates, provides quantitative evidence on the reproducibility problem."
    349     },
    350     {
    351       "title": "Questionable practices in methodological deep learning research",
    352       "authors": ["D.J. Trosten"],
    353       "year": 2023,
    354       "relevance": "Identifies specific questionable research practices in deep learning, directly relevant to methodological quality assessment."
    355     },
    356     {
    357       "title": "Unraveling overoptimism and publication bias in ML-driven science",
    358       "authors": ["P. Saidi", "G. Dasarathy", "V. Berisha"],
    359       "year": 2024,
    360       "arxiv_id": "2405.14422",
    361       "relevance": "Examines publication bias and overoptimism in ML research, relevant to understanding systematic biases in the field."
    362     },
    363     {
    364       "title": "Opening up ChatGPT: Tracking openness, transparency, and accountability in instruction-tuned text generators",
    365       "authors": ["A. Liesenfeld", "A. Lopez", "M. Dingemanse"],
    366       "year": 2023,
    367       "relevance": "Assesses transparency of LLM systems, relevant to reproducibility of LLM-based research."
    368     },
    369     {
    370       "title": "Model Cards for Model Reporting",
    371       "authors": ["M. Mitchell", "S. Wu", "A. Zaldivar"],
    372       "year": 2019,
    373       "relevance": "Proposes model documentation standards relevant to transparency and reproducibility of ML systems."
    374     },
    375     {
    376       "title": "Black box or open science? Assessing reproducibility-related documentation in AI research",
    377       "authors": ["F. Koenigstorfer", "A. Haberl", "D. Kowald"],
    378       "year": 2024,
    379       "relevance": "Empirically assesses reproducibility documentation in AI papers, directly relevant to methodological quality measurement."
    380     }
    381   ]
    382 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs