scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19502B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Rigor, Reliability, and Reproducibility Matter: A Decade-Scale Survey of 572 Code Benchmarks",
      6     "authors": [
      7       "Jialun Cao",
      8       "Yuk-Kit Chan",
      9       "Zixuan Ling",
     10       "Wenxuan Wang",
     11       "Shuqing Li",
     12       "Mingwei Liu",
     13       "Ruixi Qiao",
     14       "Yuting Han",
     15       "Chaozheng Wang",
     16       "Boxi Yu",
     17       "Pinjia He",
     18       "Shuai Wang",
     19       "Zibin Zheng",
     20       "Michael R. Lyu",
     21       "Shing-Chi Cheung"
     22     ],
     23     "year": 2025,
     24     "venue": "arXiv preprint",
     25     "arxiv_id": "2501.10711",
     26     "doi": "10.48550/arXiv.2501.10711"
     27   },
     28   "checklist": {
     29     "claims_and_evidence": {
     30       "abstract_claims_supported": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "All statistics quoted in the abstract (82.5% contamination, 84.2% no coverage, 48.1% no QA, 38.8% no prompts) are backed by the full analysis of 572 benchmarks detailed in Appendix C with figures.",
     34         "source": "haiku"
     35       },
     36       "causal_claims_justified": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The main causal claim — that issues stem from lack of awareness plus resource constraints — is supported by a 49-participant human study showing explicit awareness gaps (16% unaware of denoising need, 40%+ unaware of environment documentation importance), which is adequate for this modest claim.",
     40         "source": "haiku"
     41       },
     42       "generalization_bounded": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper explicitly scopes its empirical analysis to code-related benchmarks (2014–2025) and separately argues in the Impact Statement that principles may extend to other ML domains, clearly distinguishing the two.",
     46         "source": "haiku"
     47       },
     48       "alternative_explanations_discussed": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section 6.1 explicitly addresses 'An Alternative View' from two participants who stressed that practical constraints (time, budget, resources) — not just lack of awareness — explain flawed benchmarks, and the paper engages with this tension directly.",
     52         "source": "haiku"
     53       },
     54       "proxy_outcome_distinction": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper measures presence/absence of concrete benchmark practices (deduplication, contamination handling, repeated experiments) and claims these directly threaten benchmark validity — the measured constructs match the claimed constructs without unexamined proxy leaps.",
     58         "source": "haiku"
     59       }
     60     },
     61     "limitations_and_scope": {
     62       "limitations_section_present": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "There is no dedicated limitations or threats-to-validity section; Section 6 discusses trade-offs between rigor and efficiency but is framed as a discussion of benchmark developers' constraints, not the study's own methodological limitations.",
     66         "source": "haiku"
     67       },
     68       "threats_to_validity_specific": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The paper does not discuss threats to its own survey methodology — e.g., potential selection bias from snowballing, rater reliability in manually profiling 572 benchmarks, or the representativeness of its 49-person human study sample.",
     72         "source": "haiku"
     73       },
     74       "scope_boundaries_stated": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper explicitly bounds its scope to code-related benchmarks from 2014–2025 and notes in the Impact Statement that the empirical analysis is limited to this domain, while conceptual arguments may apply more broadly.",
     78         "source": "haiku"
     79       }
     80     },
     81     "conflicts_of_interest": {
     82       "funding_disclosed": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No funding acknowledgment appears anywhere in the provided paper text; multiple authors are affiliated with major funded institutions but no grants or sponsors are mentioned.",
     86         "source": "haiku"
     87       },
     88       "affiliations_disclosed": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "All 15 authors have institutional affiliations listed in footnote 1, spanning HKUST, CUHK, Renmin University, Sun Yat-Sen University, Chinese Academy of Sciences, and others.",
     92         "source": "haiku"
     93       },
     94       "funder_independent_of_outcome": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "No funding source is disclosed, so independence cannot be assessed.",
     98         "source": "haiku"
     99       },
    100       "financial_interests_declared": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No competing interests or financial interests statement appears in the paper.",
    104         "source": "haiku"
    105       }
    106     },
    107     "scope_and_framing": {
    108       "key_terms_defined": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper defines rigor, reliability, and reproducibility operationally through the five benchmark lifecycle phases, and Section 3.1 provides concrete definitions of each development phase with specific criteria.",
    112         "source": "haiku"
    113       },
    114       "intended_contribution_clear": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The paper states five contributions explicitly in Section 1: HOW2BENCH guideline, significance as first comprehensive actionable guideline, usefulness for practitioners, generalizability, and long-term community impact.",
    118         "source": "haiku"
    119       },
    120       "engagement_with_prior_work": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Section 2 systematically contrasts this work with BetterBench and other surveys along scope, lifecycle coverage, scale (572 vs. 24 benchmarks), and objectives; the paper positions itself as revealing longitudinal trends that prior guidelines cannot.",
    124         "source": "haiku"
    125       }
    126     }
    127   },
    128   "type_checklist": {
    129     "survey": {
    130       "search_and_selection": {
    131         "search_strategy_reproducible": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper describes collecting benchmarks 'according to their publication time, venue, and coding tasks' with snowballing, but provides no specific search strings, database names, or queries that would allow someone to replicate the initial collection.",
    135           "source": "haiku"
    136         },
    137         "inclusion_exclusion_explicit": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No formal inclusion/exclusion criteria are stated; the paper says it collected 'code-related benchmarks' but does not define what qualifies as code-related or specify any exclusion rules.",
    141           "source": "haiku"
    142         },
    143         "prisma_or_structured_protocol": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "The 4-step study design (guideline construction, literature profiling, case study, human study) is described as a workflow but does not follow PRISMA or any other named systematic review protocol.",
    147           "source": "haiku"
    148         },
    149         "search_terms_provided": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "No search terms or queries are provided anywhere in the paper; only 'snowballing' from existing benchmarks is mentioned as the collection technique.",
    153           "source": "haiku"
    154         },
    155         "databases_listed": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No databases (e.g., arXiv, ACM DL, IEEE Xplore, Google Scholar) are explicitly listed as sources for benchmark discovery.",
    159           "source": "haiku"
    160         },
    161         "screening_process_documented": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "The paper reports 572 final benchmarks but provides no flow diagram or counts showing how many were initially identified, screened, and excluded at each stage.",
    165           "source": "haiku"
    166         },
    167         "review_scope_justified": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "The paper implicitly justifies the 2014–2025 scope by anchoring to Defects4J (2014) as the earliest benchmark and the emergence of LLMs; the focus on code benchmarks is justified in the introduction by their critical role in shaping understanding of LLM capabilities.",
    171           "source": "haiku"
    172         }
    173       },
    174       "synthesis_quality": {
    175         "conflicting_findings_acknowledged": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "The paper acknowledges positive trends (manual QA doubling, more LLMs evaluated, growing open-source releases) alongside the dominant negative finding of persistent quality gaps, and Section 6.1 explicitly presents an 'Alternative View' from practitioners.",
    179           "source": "haiku"
    180         },
    181         "quality_assessment_of_sources": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "The 55-item HOW2BENCH checklist is applied to all 572 benchmarks to generate quality statistics, constituting a structured quality assessment of source papers; a focused case study applies the full checklist to 30 representative benchmarks.",
    185           "source": "haiku"
    186         },
    187         "publication_bias_discussed": {
    188           "applies": true,
    189           "answer": false,
    190           "justification": "The paper does not discuss publication bias — e.g., that flawed benchmarks may be less likely to be published or cited, which could distort the observed quality distribution.",
    191           "source": "haiku"
    192         },
    193         "quantitative_synthesis_present": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "The paper provides extensive quantitative synthesis: percentages of benchmarks meeting each criterion, year-over-year trend figures (Figures 8–59), and counts across all five lifecycle phases for all 572 benchmarks.",
    197           "source": "haiku"
    198         },
    199         "recommendations_supported_by_evidence": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Each HOW2BENCH criterion is introduced alongside key statistics showing how many benchmarks violate it and why this matters (e.g., 84.2% ignore code coverage → allows incorrect code to pass tests), grounding recommendations in observed prevalence.",
    203           "source": "haiku"
    204         }
    205       }
    206     }
    207   },
    208   "claims": [
    209     {
    210       "claim": "82.5% of 572 code benchmarks do not consider or handle data contamination threats.",
    211       "evidence": "Figure 26 and Section 4.2 report this statistic directly from the full analysis of 572 benchmarks.",
    212       "supported": "strong"
    213     },
    214     {
    215       "claim": "84.2% of benchmarks using test suites did not consider code coverage, threatening evaluation validity.",
    216       "evidence": "Figure 32 and Section 4.2 report this with year-over-year breakdown (19, 62, 102 non-compliant benchmarks in 2023–2025).",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "Despite growing awareness, the absolute number of flawed benchmarks continues to rise because benchmark production is growing faster than quality practices.",
    221       "evidence": "Year-over-year figures throughout Appendix C show proportions improving slightly while absolute counts worsen; 271 benchmarks produced in 2025 alone.",
    222       "supported": "strong"
    223     },
    224     {
    225       "claim": "48.1% of benchmarks underwent no quality assurance checks whatsoever.",
    226       "evidence": "Figure 23 and Section 4.2 report this directly from the full corpus analysis.",
    227       "supported": "strong"
    228     },
    229     {
    230       "claim": "64% of benchmark evaluations were not repeated to control for LLM randomness.",
    231       "evidence": "Figure 42 and Section 4.3 report this; year-over-year Figure 43 shows 169 non-repeating benchmarks in 2025 alone.",
    232       "supported": "strong"
    233     },
    234     {
    235       "claim": "Benchmark quality problems stem partly from lack of awareness, not just resource constraints.",
    236       "evidence": "Human study (n=49): 16% unaware of denoising necessity, 40%+ believe environment documentation is unimportant; however, convenience sample from authors' networks limits generalizability.",
    237       "supported": "moderate"
    238     },
    239     {
    240       "claim": "18% of benchmarks act as data sources for downstream benchmarks, meaning quality flaws propagate through the benchmark ecosystem.",
    241       "evidence": "Figure 59 (benchmark inheritance graph) shows HumanEval serving as a data source for at least 15 downstream benchmarks; this structural point is well supported by the inheritance graph.",
    242       "supported": "strong"
    243     }
    244   ],
    245   "methodology_tags": [
    246     "observational",
    247     "meta-analysis",
    248     "qualitative"
    249   ],
    250   "key_findings": "A decade-scale analysis of 572 code-related benchmarks (2014–2025) finds a persistent and widening gap between community awareness of benchmark quality issues and actual practice: 82.5% ignore data contamination, 84.2% omit code coverage, 64% don't repeat experiments, and 15% are not fully open-sourced. Although the proportions are improving slightly year over year, the absolute count of flawed benchmarks keeps rising because benchmark production volume is growing faster than the adoption of quality practices. A 49-person human study reveals dual causes: resource constraints and genuine awareness gaps, motivating the HOW2BENCH 55-criterion lifecycle checklist as a corrective tool. Positive trends include a doubling of manual QA checks and a growing number of open-source releases from 2024 to 2025.",
    251   "red_flags": [
    252     {
    253       "flag": "Non-reproducible collection",
    254       "detail": "No search strings, databases, or formal inclusion criteria are provided; snowballing is described but not operationalized, making the benchmark corpus impossible to independently replicate."
    255     },
    256     {
    257       "flag": "No screening flow documentation",
    258       "detail": "The paper reports 572 final benchmarks but provides no PRISMA-style flow showing how many were initially found, screened, and excluded at each stage."
    259     },
    260     {
    261       "flag": "No limitations section",
    262       "detail": "The study's own methodological threats (rater reliability for profiling 572 benchmarks, sampling bias from snowballing, subjective judgment in assigning quality labels) are never discussed."
    263     },
    264     {
    265       "flag": "Small convenience-sampled human study",
    266       "detail": "The 49-participant questionnaire was distributed via 'academic and professional networks' of the authors, creating selection bias; all were from SE/AI fields and most from Asia, limiting generalizability."
    267     },
    268     {
    269       "flag": "Focused case study selection may be unrepresentative",
    270       "detail": "30 'representative' benchmarks were selected by 'top-5 citation plus latest 1' per task, but this citation-based selection likely over-represents high-quality benchmarks and may not be representative of the full distribution."
    271     },
    272     {
    273       "flag": "No inter-rater reliability reported",
    274       "detail": "Profiling 572 benchmarks against 55 criteria was performed by the research team but no inter-rater agreement scores are reported, raising concerns about consistency in how criteria were applied."
    275     }
    276   ],
    277   "cited_papers": [
    278     {
    279       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    280       "relevance": "Primary benchmark example used throughout; represents state-of-the-art code benchmark being evaluated by HOW2BENCH criteria"
    281     },
    282     {
    283       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    284       "relevance": "Most cited source benchmark in the ecosystem (benefits 15+ downstream benchmarks); used as case study for quality issues including incorrect ground truth"
    285     },
    286     {
    287       "title": "Program Synthesis with Large Language Models (MBPP)",
    288       "relevance": "Second most influential source benchmark; used as case study for duplicated tasks, incorrect tests, and out-of-scope capability items"
    289     },
    290     {
    291       "title": "BetterBench: Assessing AI Benchmarks, Uncovering Issues, and Establishing Best Practices",
    292       "relevance": "Primary related work; directly compared against HOW2BENCH to distinguish scope, scale, and objectives"
    293     },
    294     {
    295       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation (EvalPlus/HumanEval+)",
    296       "relevance": "Key motivating work showing insufficient test coverage allows incorrect code to pass benchmark evaluations"
    297     },
    298     {
    299       "title": "NLP Evaluation in Trouble: On the Need to Measure LLM Data Contamination for Each Benchmark",
    300       "relevance": "Foundational work on data contamination threats cited to motivate contamination handling criterion"
    301     },
    302     {
    303       "title": "A Survey on Evaluation of Large Language Models",
    304       "relevance": "Most directly related survey on LLM evaluation; paper positions HOW2BENCH as complementary with different scope (code-specific, benchmark construction vs. evaluation methodology)"
    305     },
    306     {
    307       "title": "Benchmarking AI Models in Software Engineering: A Review, Search Tool, and Unified Approach for Elevating Benchmark Quality",
    308       "relevance": "Concurrent related work on SE benchmark assessment; cited as complementary guideline effort"
    309     }
    310   ],
    311   "engagement_factors": {
    312     "practical_relevance": {
    313       "score": 3,
    314       "justification": "HOW2BENCH is a directly printable 55-item checklist practitioners can apply before and after building code benchmarks; immediately actionable."
    315     },
    316     "surprise_contrarian": {
    317       "score": 2,
    318       "justification": "The finding that awareness has not translated into practice — and that the absolute number of flawed benchmarks is accelerating — challenges the optimistic narrative that community awareness alone drives improvement."
    319     },
    320     "fear_safety": {
    321       "score": 1,
    322       "justification": "Raises concern that flawed benchmarks mislead research directions and hinder meaningful progress, but does not frame this as an AI safety issue per se."
    323     },
    324     "drama_conflict": {
    325       "score": 1,
    326       "justification": "Calls out widely-used benchmarks (HumanEval, MBPP) for specific quality failures, but tone is constructive rather than accusatory."
    327     },
    328     "demo_ability": {
    329       "score": 2,
    330       "justification": "The paper includes a printable version of HOW2BENCH (Appendix G) that readers can immediately apply to their own benchmark projects."
    331     },
    332     "brand_recognition": {
    333       "score": 1,
    334       "justification": "Authors are from well-regarded SE groups (HKUST, CUHK) but no major industry lab; paper does not involve a famous product or model."
    335     }
    336   },
    337   "hn_data": {
    338     "threads": [
    339       {
    340         "hn_id": "43071030",
    341         "title": "Infrastructure for AI Agents",
    342         "points": 3,
    343         "comments": 0,
    344         "url": "https://news.ycombinator.com/item?id=43071030"
    345       },
    346       {
    347         "hn_id": "42867278",
    348         "title": "Large Language Model Training Using FP4 Quantization",
    349         "points": 2,
    350         "comments": 0,
    351         "url": "https://news.ycombinator.com/item?id=42867278"
    352       }
    353     ],
    354     "top_points": 3,
    355     "total_points": 5,
    356     "total_comments": 0
    357   }
    358 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs