ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (16466B)


      1 {
      2   "paper": {
      3     "title": "Questionable practices in machine learning",
      4     "authors": ["Gavin Leech", "Juan J Vazquez", "Niclas Kupper", "Misha Yagudin", "Laurence Aitchison"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2407.12220"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive is mentioned in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset or structured data artifact is released. The paper is a taxonomy with examples drawn from existing literature."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a qualitative taxonomy paper with no computational experiments requiring an environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "No experiments to reproduce; this is a conceptual taxonomy paper."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "The paper does not run experiments or report quantitative results of its own."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No comparative quantitative claims are made by the authors from their own experiments."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No original experiments; effect sizes cited are from other papers."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No empirical study with a sample conducted by the authors."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No original experimental runs to report variance over."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares to and extends prior work on QRPs, notably Wicherts et al. (2016) from psychology and Biderman et al. (2024) from ML. Section 2 discusses related work extensively."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Related work includes contemporary references such as Biderman et al. (2024), Kapoor et al. (2024), and Hofman et al. (2023)."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No system with components to ablate; this is a taxonomy paper."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No quantitative evaluation is performed."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No system outputs to evaluate. This is a conceptual taxonomy."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No dataset or test set used."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The taxonomy is organized into clear categories: contamination (10 types), cherrypicking (8 types), misreporting (12 types), amplifiers (3 types), and irreproducible research practices (8 types), presented in Tables 1 and 2."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The entire paper consists of discussing failure cases — questionable practices with concrete examples from published work."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 5.4 Limitations acknowledges the paper does not quantify prevalence or severity of QRPs, and notes the taxonomy is not exhaustive."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims to describe 44 QRPs with examples, which the paper delivers across Sections 3 and 4 with Tables 1 and 2."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper describes and categorizes practices rather than making causal claims. Causal examples cited (e.g., contamination causing score inflation) reference others' experimental findings."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 5.4 explicitly states: 'We do not expect Table 1 to be exhaustive' and 'this work does not quantify the prevalence or severity of the QRPs, so we cannot tell you how much to worry.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 5.3 discusses root causes from multiple angles (researcher incentives vs. industrialization). The paper also consistently notes when QRPs could be accidental vs. intentional (Table 1 'Accidental?' column)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "No models are used or evaluated by the authors."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting is used in this paper."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No experiments conducted requiring hyperparameters."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper does not describe a systematic methodology for how the 44 QRPs were identified, collected, or validated. No search strategy, inclusion/exclusion criteria, or literature review protocol is documented."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5.4 'Limitations' is a dedicated subsection discussing the paper's shortcomings."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 5.4 states specific limitations: the taxonomy is not exhaustive, the paper cannot quantify prevalence or severity, and it acknowledges dual-use risk."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The introduction states: 'we do not claim that most performance is spurious. Nor do we show the general prevalence of these problems. This paper answers the limited question \"what could make a model's reported performance to some extent spurious?\"'"
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No structured dataset of QRPs, examples, or references is released for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The paper does not describe how the 44 QRPs were systematically identified. Section 2.1 mentions Twitter as a source of leads, but no formal collection methodology is described."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants recruited; this is a literature-based taxonomy."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No pipeline from literature search to final taxonomy is documented. The selection and organization of QRPs appear ad hoc."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 6.1 states: 'GL is funded by the UKRI Centre for Doctoral Training in Interactive Artificial Intelligence (EP/S022937/1).'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Arb Research, University of Bath, University of Bristol."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "UKRI doctoral training funding has no stake in the outcome of a taxonomy of questionable research practices."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is included in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate any pre-trained model on a benchmark."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No model evaluation on benchmarks performed."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No model evaluation on benchmarks performed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This is a taxonomy/survey paper with no computational method to cost."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No computation performed."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "44 questionable research practices can undermine reported ML results, falling into contamination, cherrypicking, and misreporting categories.",
    286       "evidence": "Tables 1 and 2, Sections 3 and 4 enumerate and describe all 44 practices with examples from published work.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Data contamination causes large changes in benchmark performance — e.g., Gemini 1.0 Ultra increased HumanEval from 74.4% to 89.0% when exposed to the test set once in pre-training.",
    291       "evidence": "Section 3.1.1 cites Reid et al. (2024) for this specific result.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "The industrialization of AI research creates incentives misaligned with scientific evaluation norms.",
    296       "evidence": "Section 5.3.2 discusses how business goals (marketing, investment) diverge from scientific goals (fair comparison, generalization).",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Subtle prompt and harness differences can cause ~30% performance swings, as shown by the MMLU scoring discrepancy across three evaluation harnesses.",
    301       "evidence": "Section 3.2.6 describes how Llama-65b's MMLU score varied by nearly 30% between the EleutherAI harness and the original MMLU/HELM harnesses (Fourrier et al., 2023).",
    302       "supported": "strong"
    303     }
    304   ],
    305   "methodology_tags": ["qualitative"],
    306   "key_findings": "The paper catalogs 44 questionable research practices (QRPs) in ML evaluation organized into contamination (10 types), cherrypicking (8 types), misreporting (12 types), and amplifiers (3 types), plus 9 irreproducible research practices. It provides concrete examples from published work including Gemini, GPT-4, Phi-3, and Falcon launches. The paper identifies two root causes: researcher self-certification of SOTA results and the industrialization of AI research, and proposes defenses including standardized evaluation harnesses, private benchmarks, and preregistration.",
    307   "red_flags": [
    308     {
    309       "flag": "No systematic methodology for taxonomy construction",
    310       "detail": "The 44 QRPs appear to have been collected ad hoc from literature and Twitter rather than through a systematic review process. No search strategy, inclusion criteria, or saturation analysis is described."
    311     },
    312     {
    313       "flag": "No prevalence or severity estimates",
    314       "detail": "The paper acknowledges in Section 5.4 that it cannot quantify how common or damaging these practices are, limiting its utility for assessing the actual state of ML research quality."
    315     }
    316   ],
    317   "cited_papers": [
    318     {
    319       "title": "Lessons from the Trenches on Reproducible Evaluation of Language Models",
    320       "authors": ["Stella Biderman"],
    321       "year": 2024,
    322       "relevance": "Directly addresses methodological problems in LLM evaluation, a core concern of this survey."
    323     },
    324     {
    325       "title": "AI Agents That Matter",
    326       "authors": ["Sayash Kapoor"],
    327       "year": 2024,
    328       "relevance": "Discusses evaluation methodology issues for AI agents including cost reporting and baseline fairness."
    329     },
    330     {
    331       "title": "Troubling Trends in Machine Learning Scholarship",
    332       "authors": ["Zachary C. Lipton", "Jacob Steinhardt"],
    333       "year": 2019,
    334       "relevance": "Foundational work on inflated claims and poor methodology in ML research."
    335     },
    336     {
    337       "title": "Are We Learning Yet? A Meta-Review of Evaluation Failures Across Machine Learning",
    338       "authors": ["Thomas Liao"],
    339       "year": 2021,
    340       "relevance": "Studies the mismatch between benchmarks and real-world problems in ML evaluation."
    341     },
    342     {
    343       "title": "Holistic Evaluation of Language Models",
    344       "authors": ["Percy Liang"],
    345       "year": 2022,
    346       "relevance": "Major evaluation framework (HELM) for LLMs addressing standardization of evaluation."
    347     },
    348     {
    349       "title": "How to Avoid Machine Learning Pitfalls: a Guide for Academic Researchers",
    350       "authors": ["Michael A. Lones"],
    351       "year": 2021,
    352       "relevance": "Systematic guide to ML anti-patterns, complementary to this paper's QRP taxonomy."
    353     },
    354     {
    355       "title": "Chatbot Arena: An open platform for evaluating LLMs by human preference",
    356       "authors": ["Wei-Lin Chiang"],
    357       "year": 2024,
    358       "relevance": "Major human-preference evaluation platform discussed as a defense against contamination and reification."
    359     },
    360     {
    361       "title": "GPT-4 technical report",
    362       "authors": ["Josh Achiam"],
    363       "year": 2023,
    364       "arxiv_id": "2303.08774",
    365       "relevance": "Key reference for reported contamination in frontier LLMs and evaluation methodology."
    366     },
    367     {
    368       "title": "Evaluating large language models trained on code",
    369       "authors": ["Mark Chen"],
    370       "year": 2021,
    371       "arxiv_id": "2107.03374",
    372       "relevance": "Introduces HumanEval benchmark, a central example in contamination discussions."
    373     },
    374     {
    375       "title": "Measuring Data Contamination in Large-Scale Benchmarks",
    376       "authors": ["Martin Riddell"],
    377       "year": 2024,
    378       "relevance": "Demonstrates contamination in popular open-source training corpora (The Pile, The Stack) with HumanEval."
    379     }
    380   ]
    381 }

Impressum · Datenschutz