scan.json (30892B)
1 { 2 "paper": { 3 "title": "Rigor, Reliability, and Reproducibility Matter: A Decade-Scale Survey of 572 Code Benchmarks", 4 "authors": [ 5 "Jialun Cao", 6 "Yuk-Kit Chan", 7 "Zixuan Ling", 8 "Wenxuan Wang", 9 "Shuqing Li", 10 "Mingwei Liu", 11 "Ruixi Qiao", 12 "Yuting Han", 13 "Chaozheng Wang", 14 "Boxi Yu", 15 "Pinjia He", 16 "Shuai Wang", 17 "Zibin Zheng", 18 "Michael R. Lyu", 19 "Shing-Chi Cheung" 20 ], 21 "year": 2025, 22 "venue": "arXiv preprint", 23 "arxiv_id": "2501.10711", 24 "doi": "10.48550/arXiv.2501.10711" 25 }, 26 "scan_version": 3, 27 "active_modules": ["survey_methodology"], 28 "methodology_tags": ["meta-analysis", "qualitative"], 29 "key_findings": "A decade-scale survey of 572 code benchmarks (2014–2025) reveals that despite growing awareness of benchmark quality, the absolute number of flawed benchmarks continues to rise. Key deficiencies: 48.1% lack quality assurance checks, 82.5% ignore data contamination, 84.2% don't ensure test coverage, and 64% don't repeat experiments. A human study of 49 researchers confirms awareness gaps (16% unaware of data denoising needs, 40%+ unaware of environment impact on reproducibility). The paper introduces HOW2BENCH, a 55-criteria lifecycle checklist for code benchmark development.", 30 "checklist": { 31 "artifacts": { 32 "code_released": { 33 "applies": true, 34 "answer": false, 35 "justification": "No code repository URL or analysis scripts are provided in the paper. The paper provides the HOW2BENCH checklist in Appendix G and benchmark lists in Appendices E–F, but no downloadable code or analysis tools." 36 }, 37 "data_released": { 38 "applies": true, 39 "answer": false, 40 "justification": "The full list of 572 benchmarks is provided in Appendix F and the 30 focused benchmarks in Appendix E, but these are inline in the paper only. No downloadable dataset, repository, or supplementary data file is referenced." 
41 }, 42 "environment_specified": { 43 "applies": true, 44 "answer": false, 45 "justification": "No environment specifications are provided. As a survey, the paper could have specified the tools and environment used for data collection and analysis, but does not." 46 }, 47 "reproduction_instructions": { 48 "applies": true, 49 "answer": false, 50 "justification": "No reproduction instructions are provided. The survey methodology is described at a high level (Section 3.2, Figure 2), but there are no step-by-step instructions for reproducing the profiling or analysis." 51 } 52 }, 53 "statistical_methodology": { 54 "confidence_intervals_or_error_bars": { 55 "applies": true, 56 "answer": false, 57 "justification": "All statistics are reported as point estimates (e.g., '48.1%', '82.5%', '84.2%'). No confidence intervals or error bars are provided for any of the survey findings." 58 }, 59 "significance_tests": { 60 "applies": true, 61 "answer": false, 62 "justification": "The paper makes trend claims (e.g., 'the number of benchmarks that ignore code coverage when providing test cases nearly matches the total count accumulated across the previous ten years') without any significance tests. No p-values, chi-squared tests, or trend tests are reported." 63 }, 64 "effect_sizes_reported": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper consistently reports proportions with denominators providing context (e.g., '200/572 = 34.97%', '210/572 = 36.7%', '48/485 = 9.9%'). Trend comparisons give both raw counts and percentages across years (e.g., 'doubled from 60 in 2024 to 159 in 2025')." 68 }, 69 "sample_size_justified": { 70 "applies": true, 71 "answer": false, 72 "justification": "The survey covers 572 benchmarks and the human study includes 49 participants, but neither sample size is formally justified. No power analysis or representativeness argument is given for the 572 benchmarks or the 49 human participants." 
73 }, 74 "variance_reported": { 75 "applies": true, 76 "answer": false, 77 "justification": "No variance, standard deviation, or spread measures are reported. All statistics are single-point percentages or counts without uncertainty quantification." 78 } 79 }, 80 "evaluation_design": { 81 "baselines_included": { 82 "applies": true, 83 "answer": false, 84 "justification": "The paper discusses related surveys (BetterBench, Koohestani et al., Chang et al.) in Section 2.2 and differentiates its scope, but does not systematically compare its findings against prior surveys' results." 85 }, 86 "baselines_contemporary": { 87 "applies": true, 88 "answer": false, 89 "justification": "No systematic baseline comparison is conducted, so contemporaneity cannot be assessed. The related works discussed (BetterBench 2024, Koohestani et al. 2025) are recent but not used as empirical baselines." 90 }, 91 "ablation_study": { 92 "applies": false, 93 "answer": false, 94 "justification": "This is a survey/position paper with no system components to ablate." 95 }, 96 "multiple_metrics": { 97 "applies": true, 98 "answer": true, 99 "justification": "The paper assesses benchmarks across multiple dimensions organized by lifecycle phases: design (4 criteria), construction (19 criteria), evaluation (12 criteria), analysis (10 criteria), and release (10 criteria), with statistics on deduplication, contamination, code coverage, repeatability, open-sourcing, and many more." 100 }, 101 "human_evaluation": { 102 "applies": true, 103 "answer": true, 104 "justification": "Section 5 and Appendix D describe a human study with 49 participants who evaluated the importance and practicality of HOW2BENCH's 55 criteria via questionnaires." 105 }, 106 "held_out_test_set": { 107 "applies": false, 108 "answer": false, 109 "justification": "Not applicable to a survey study — there is no training/test split." 
110 }, 111 "per_category_breakdown": { 112 "applies": true, 113 "answer": true, 114 "justification": "Extensive breakdowns are provided: by year (Figures 8, 10, 12, 16, 21, 24, 27, 29, 33, 36, 43, 47, 49), by coding task (Figure 9), by programming language (Figure 11), by granularity (Figure 15), and by individual quality criteria throughout Section 4 and Appendix C." 115 }, 116 "failure_cases_discussed": { 117 "applies": true, 118 "answer": true, 119 "justification": "Multiple specific failure cases are discussed: MBPP duplicated subjects (Figure 22), MBPP unexecutable code (Figure 25), MBPP incorrect tests (Figure 34), MBPP out-of-capability case (Figure 18), HumanEval incorrect ground truth (Figure 30), CruxEval unclear presentation (Figure 45), XSemPLR API key leakage (Figure 55), CrossVul name/email leakage (Figure 56)." 120 }, 121 "negative_results_reported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The central finding is negative: despite growing awareness, flawed benchmarks continue to rise. The paper reports many negative findings: 48.1% no QA, 82.5% no contamination handling, 84.2% no test coverage, 64% no repeated experiments, 38.8% no prompts released, etc." 125 } 126 }, 127 "claims_and_evidence": { 128 "abstract_claims_supported": { 129 "applies": true, 130 "answer": true, 131 "justification": "All abstract claims are supported by specific statistics in the body: '48.1% benchmarks did not go through quality assurance check' (Section 4.2, Figure 23), '82.5% did not consider data contamination' (Section 4.2, Figure 26), '84.2% did not ensure a reliable judgement' (Section 4.2, Figure 32), '64.0% benchmark evaluation was one-pass' (Section 4.3, Figure 42), '38.8% did not provide essential information for reproducibility' (Section 4.5, Figure 48)." 
132 }, 133 "causal_claims_justified": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper claims 'the current issues not only stem from the significant effort required, but also from a lack of awareness regarding their importance' (Section 1, Section 6.2). This causal claim is based on a cross-sectional questionnaire of 49 participants, which cannot establish causation. The study design (descriptive survey + questionnaire) is inadequate for causal inference about why benchmarks are flawed." 137 }, 138 "generalization_bounded": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper claims 'Most criteria listed in HOW2BENCH can be adopted or adapted to other benchmarks such as Question-answering, mathematical reasoning, and multi-modal benchmarks' (Section 1, Contributions) and the Impact Statement claims 'Broader Influence on Machine Learning Evaluation.' These generalizations beyond code benchmarks are not empirically tested." 142 }, 143 "alternative_explanations_discussed": { 144 "applies": true, 145 "answer": true, 146 "justification": "Section 6.1 discusses the trade-off between rigor and development efficiency as an alternative explanation. The 'Alternative View' subsection in Section 1 acknowledges that 'practical constraints such as time, budget, and human resources must also be considered.' Section 6.2 discusses multiple factors behind the awareness-action gap." 147 }, 148 "proxy_outcome_distinction": { 149 "applies": true, 150 "answer": true, 151 "justification": "The paper's claims are appropriately scoped to what it measures. It reports checklist compliance rates (e.g., '48.1% did not go through quality assurance check') and frames these as specific criterion violations rather than as holistic quality scores. The paper argues these criteria matter (citing prior work showing bugs in benchmarks, contamination effects, etc.) without conflating compliance with overall quality." 
152 } 153 }, 154 "setup_transparency": { 155 "model_versions_specified": { 156 "applies": false, 157 "answer": false, 158 "justification": "This is a survey paper that does not use any LLMs for its analysis." 159 }, 160 "prompts_provided": { 161 "applies": false, 162 "answer": false, 163 "justification": "This is a survey paper that does not use prompting." 164 }, 165 "hyperparameters_reported": { 166 "applies": false, 167 "answer": false, 168 "justification": "This is a survey paper with no LLM experiments requiring hyperparameters." 169 }, 170 "scaffolding_described": { 171 "applies": false, 172 "answer": false, 173 "justification": "No agentic scaffolding is used in this survey." 174 }, 175 "data_preprocessing_documented": { 176 "applies": true, 177 "answer": false, 178 "justification": "The paper describes a 4-step study design (Section 3.2, Figure 2) including collection via snowballing and profiling, but does not document specific filtering criteria, stage-by-stage counts, or how many candidate benchmarks were screened before arriving at 572. The pipeline from initial search to final 572 benchmarks is not documented with intermediate counts or explicit inclusion/exclusion criteria." 179 } 180 }, 181 "limitations_and_scope": { 182 "limitations_section_present": { 183 "applies": true, 184 "answer": true, 185 "justification": "Section 6 'Discussion' serves as a limitations section. Section 6.1 discusses 'Trade-offs Between Benchmark Rigor and Development Efficiency' and Section 6.2 discusses 'Awareness and Action.' The 'Alternative View' subsection in Section 1 also acknowledges practical constraints." 186 }, 187 "threats_to_validity_specific": { 188 "applies": true, 189 "answer": false, 190 "justification": "The discussion sections address the phenomenon being studied (trade-offs in benchmark development) rather than threats to the study's own validity. 
There is no discussion of potential biases in their benchmark selection, inter-rater reliability of their profiling process, or representativeness of their 49-person human study sample." 191 }, 192 "scope_boundaries_stated": { 193 "applies": true, 194 "answer": false, 195 "justification": "While the paper states its scope as 'code-related benchmarks' from '2014–2025,' it does not explicitly state what it does NOT cover or what claims it is NOT making. No specific exclusion criteria or boundary conditions are articulated (e.g., which types of code benchmarks were excluded and why)." 196 } 197 }, 198 "data_integrity": { 199 "raw_data_available": { 200 "applies": true, 201 "answer": false, 202 "justification": "The per-benchmark profiling data (whether each of 572 benchmarks meets each criterion) is not available for download. Only aggregated statistics are reported in the paper. The raw annotations cannot be independently verified." 203 }, 204 "data_collection_described": { 205 "applies": true, 206 "answer": true, 207 "justification": "Section 3.2 Step 2 describes the collection procedure: 'collecting related benchmarks according to their publication time, venue, and coding tasks, and then employing techniques like snowballing to ensure a comprehensive collection.' The profiling process is also described: 'profiling each selected benchmark through a thorough review of corresponding papers and examination of the released artifacts or homepages.'" 208 }, 209 "recruitment_methods_described": { 210 "applies": true, 211 "answer": true, 212 "justification": "Appendix D.1 describes human study participant selection: 'we chose graduate students from SE or AI fields who have published at least one paper.' It also notes targeting individuals who have published on code benchmarks. Appendix D.3 describes distribution via 'online platforms, targeting academic and professional networks related to SE and AI' with start and end dates (October 27 – November 27, 2024)." 
213 }, 214 "data_pipeline_documented": { 215 "applies": true, 216 "answer": false, 217 "justification": "The pipeline from initial benchmark collection to the final 572 is not documented with stage counts. The paper states they used snowballing and scoped by time/venue/tasks but does not show how many candidate benchmarks were identified initially, how many were screened out, or at what stages filtering occurred." 218 } 219 }, 220 "conflicts_of_interest": { 221 "funding_disclosed": { 222 "applies": true, 223 "answer": false, 224 "justification": "No funding sources or acknowledgments section is present in the provided paper text. The paper does not disclose any funding information." 225 }, 226 "affiliations_disclosed": { 227 "applies": true, 228 "answer": true, 229 "justification": "Author affiliations are clearly listed: HKUST, CUHK, Renmin University of China, Sun Yat-Sen University, Chinese Academy of Sciences, Beijing Language and Culture University, CUHK Shenzhen. All are academic institutions." 230 }, 231 "funder_independent_of_outcome": { 232 "applies": true, 233 "answer": false, 234 "justification": "No funding information is disclosed, so independence cannot be assessed. Without knowing who funded this work, this criterion cannot be satisfied." 235 }, 236 "financial_interests_declared": { 237 "applies": true, 238 "answer": false, 239 "justification": "No competing interests or financial interests statement is present in the paper." 240 } 241 }, 242 "contamination": { 243 "training_cutoff_stated": { 244 "applies": false, 245 "answer": false, 246 "justification": "This is a survey paper that does not evaluate any pre-trained model's capability on a benchmark." 247 }, 248 "train_test_overlap_discussed": { 249 "applies": false, 250 "answer": false, 251 "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark." 
252 }, 253 "benchmark_contamination_addressed": { 254 "applies": false, 255 "answer": false, 256 "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark. (The paper discusses contamination as a quality criterion for benchmarks it surveys, but this question asks about contamination of the paper's own evaluation.)" 257 } 258 }, 259 "human_studies": { 260 "pre_registered": { 261 "applies": true, 262 "answer": false, 263 "justification": "No pre-registration is mentioned for the human study (questionnaire with 49 participants). No link to OSF, AsPredicted, or any registry is provided." 264 }, 265 "irb_or_ethics_approval": { 266 "applies": true, 267 "answer": false, 268 "justification": "No IRB or ethics board approval is mentioned for the human study involving 49 participants." 269 }, 270 "demographics_reported": { 271 "applies": true, 272 "answer": true, 273 "justification": "Participant demographics are reported: field distribution (AI 42.86%, SE 57.14%, Figure 61), geographic distribution across seven regions including US, UK, Germany, Australia, China (Figure 60), and experience (all published at least one paper, half had constructed code benchmarks)." 274 }, 275 "inclusion_exclusion_criteria": { 276 "applies": true, 277 "answer": true, 278 "justification": "Appendix D.1 states inclusion criteria: 'graduate students from SE or AI fields who have published at least one paper.' Exclusion: 'responses from those selecting No to having published a paper were excluded. Also, incomplete surveys where not all questions were answered were also considered invalid and excluded.'" 279 }, 280 "randomization_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "This is a cross-sectional questionnaire study, not an experimental study with treatment/control conditions. Randomization is not applicable." 
284 }, 285 "blinding_described": { 286 "applies": false, 287 "answer": false, 288 "justification": "This is a cross-sectional questionnaire study, not an experimental study. Blinding is not applicable." 289 }, 290 "attrition_reported": { 291 "applies": true, 292 "answer": true, 293 "justification": "Attrition is reported in Appendix D.4: 'In total, we collected 50 responses... Only one survey was invalid due to the respondent selecting have not published a paper, leaving 49 valid surveys for analysis.'" 294 } 295 }, 296 "cost_and_practicality": { 297 "inference_cost_reported": { 298 "applies": false, 299 "answer": false, 300 "justification": "This is a survey/position paper with no computational method whose cost would need reporting." 301 }, 302 "compute_budget_stated": { 303 "applies": false, 304 "answer": false, 305 "justification": "This is a survey/position paper with no significant computation requiring budget disclosure." 306 } 307 }, 308 "survey_methodology": { 309 "prisma_or_structured_protocol": { 310 "applies": true, 311 "answer": false, 312 "justification": "The paper describes a structured 4-step workflow (Figure 2: guideline construction → literature profiling → focused case study → human study) but does not follow PRISMA or reference any established systematic review protocol. No reproducible search queries, database selections, or PRISMA flow diagram are provided." 313 }, 314 "quality_assessment_of_sources": { 315 "applies": true, 316 "answer": true, 317 "justification": "The paper introduces HOW2BENCH with 55 criteria and systematically profiles 572 benchmarks against quality dimensions (deduplication, contamination, test coverage, QA checks, etc.). The 30 focused case study benchmarks (Appendix E) are assessed against the full checklist. This constitutes structured quality assessment of source benchmarks." 
318 }, 319 "publication_bias_discussed": { 320 "applies": true, 321 "answer": false, 322 "justification": "The paper does not discuss publication bias — whether published benchmarks are systematically different from unpublished ones, or whether benchmark papers with positive framing are more likely to be published. No funnel plots or publication bias tests are used." 323 } 324 } 325 }, 326 "claims": [ 327 { 328 "claim": "48.1% of benchmarks did not go through quality assurance checks; 82.5% did not consider data contamination; 62% (actually 65.7%) did not deduplicate data points.", 329 "evidence": "Survey of 572 code benchmarks documented in Section 4.2 (Construction) with statistics in Figures 20, 23, 26 and Appendix C.3.", 330 "supported": "strong" 331 }, 332 { 333 "claim": "Despite rising awareness of benchmark quality, the absolute number of flawed benchmarks has continued to grow.", 334 "evidence": "Longitudinal trend data across 2014–2025 (Figures 21, 24, 27, 33, 43). For example, benchmarks ignoring code coverage: 19 (2023), 62 (2024), 102 (2025). Benchmarks without repeated experiments rose to 169 in 2025.", 335 "supported": "strong" 336 }, 337 { 338 "claim": "84.2% of benchmarks did not ensure reliable judgment (e.g., code coverage when test suites are provided); 64.0% of evaluations were one-pass without repeating experiments.", 339 "evidence": "Section 4.2 (Figure 32) shows 84.2% did not consider test coverage. Section 4.3 (Figure 42) shows 64.0% did not repeat experiments.", 340 "supported": "strong" 341 }, 342 { 343 "claim": "38.8% of benchmarks did not provide essential information (e.g., prompts) for reproducibility; 12.4% are not open source.", 344 "evidence": "Section 4.5 with Figures 46 and 48. 
38.8% did not provide prompts, 12.4% not open-sourced, 2.6% only partially released.", 345 "supported": "strong" 346 }, 347 { 348 "claim": "16% of participants were unaware of the necessity for data denoising; over 40% were not aware that experimental setup and environment could impact reproducibility.", 349 "evidence": "Human study with 49 participants (Section 5, Appendix D). Questionnaire responses analyzed with results shown in Figures 62–63.", 350 "supported": "moderate" 351 }, 352 { 353 "claim": "All participants agreed that having a checklist for benchmark construction would contribute to quality; 47/55 criteria deemed important by over 80% of participants.", 354 "evidence": "Section 5 reports: 'all participants agreed that having a checklist for benchmark construction would contribute to the quality of the benchmark. 47/55 criteria in HOW2BENCH are deemed important by more 80% participants.'", 355 "supported": "moderate" 356 }, 357 { 358 "claim": "Project-level benchmarks nearly tripled from 2024 to 2025 (30 to 87), indicating growing focus on real-world applicability.", 359 "evidence": "Section 4.1 with Figure 16 showing the surge in project-level benchmarks.", 360 "supported": "strong" 361 }, 362 { 363 "claim": "HumanEval is the most significant source benchmark, benefiting at least 15 downstream benchmarks.", 364 "evidence": "Appendix C.1 with Figure 59 showing benchmark inheritance relationships.", 365 "supported": "strong" 366 } 367 ], 368 "red_flags": [ 369 { 370 "flag": "Small human study sample", 371 "detail": "The human study has only 49 participants (50 collected, 1 invalid). Claims about awareness gaps in the research community are based on this small, potentially non-representative sample recruited through convenience sampling via online platforms. No power analysis or representativeness argument is provided." 
372 }, 373 { 374 "flag": "No inter-rater reliability reported", 375 "detail": "The profiling of 572 benchmarks against quality criteria appears to involve subjective judgment (e.g., whether a benchmark 'considered' contamination, whether quality assurance was adequate). The paper does not report how many raters assessed each benchmark, what the inter-rater agreement was, or how disagreements were resolved." 376 }, 377 { 378 "flag": "No systematic review protocol", 379 "detail": "Despite surveying 572 benchmarks, the paper does not follow PRISMA or any established systematic review protocol. The collection methodology (snowballing, scoping by time/venue/tasks) is described at a high level without reproducible search queries, inclusion/exclusion criteria with stage counts, or database selections." 380 }, 381 { 382 "flag": "Raw profiling data not released", 383 "detail": "The per-benchmark assessments for all 572 benchmarks are not available for independent verification. Only aggregated percentages are reported, making it impossible to check individual benchmark classifications or reproduce the analysis." 384 }, 385 { 386 "flag": "Self-referential validation", 387 "detail": "HOW2BENCH is validated by asking 49 researchers whether a checklist for benchmark construction would be useful — which almost tautologically receives agreement. The validation does not test whether benchmarks scoring higher on HOW2BENCH criteria actually produce more reliable evaluation results." 388 } 389 ], 390 "cited_papers": [ 391 { 392 "title": "SWE-bench: Can language models resolve real-world github issues?", 393 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"], 394 "year": 2024, 395 "relevance": "Major code benchmark for evaluating LLMs on real-world software engineering tasks; used as motivating example throughout the paper." 
396 }, 397 { 398 "title": "Evaluating large language models trained on code", 399 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 400 "year": 2021, 401 "relevance": "Introduces HumanEval, the most influential code generation benchmark per the paper's analysis (benefits 15+ downstream benchmarks)." 402 }, 403 { 404 "title": "Program synthesis with large language models", 405 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 406 "year": 2021, 407 "relevance": "Introduces MBPP benchmark; used as primary case study for demonstrating quality issues (duplicated subjects, incorrect tests, unexecutable code)." 408 }, 409 { 410 "title": "Is your code generated by chatGPT really correct? Rigorous evaluation of large language models for code generation", 411 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 412 "year": 2023, 413 "relevance": "Found that HumanEval and MBPP contain bugs in implementation, documentation, and test cases; directly motivates the paper's concern about benchmark quality." 414 }, 415 { 416 "title": "Concerned with data contamination? Assessing countermeasures in code language model", 417 "authors": ["Jialun Cao", "Wuqi Zhang", "Shing-Chi Cheung"], 418 "year": 2024, 419 "arxiv_id": "2403.16898", 420 "relevance": "Addresses data contamination threats in code LLMs, one of the key quality dimensions assessed in HOW2BENCH." 421 }, 422 { 423 "title": "BetterBench: Assessing AI benchmarks, uncovering issues, and establishing best practices", 424 "authors": ["Anka Reuel", "Amelia Hardy", "Colin Smith", "Max Lamparth", "Malcolm Hardy", "Mykel J. Kochenderfer"], 425 "year": 2024, 426 "arxiv_id": "2411.12990", 427 "relevance": "Most closely related work: assesses 24 AI benchmarks against 46 criteria. HOW2BENCH differentiates by focusing on code benchmarks at much larger scale (572 vs 24)." 
428 }, 429 { 430 "title": "Benchmarking AI models in software engineering: A review, search tool, and unified approach for elevating benchmark quality", 431 "authors": ["Reza Koohestani", "Pieter de Bekker", "Burak Koç", "Maliheh Izadi"], 432 "year": 2025, 433 "relevance": "Concurrent work reviewing AI benchmarks in software engineering, representing the growing interest in benchmark quality assessment." 434 }, 435 { 436 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 437 "authors": ["Naman Jain", "King Han", "Alex Gu"], 438 "year": 2024, 439 "arxiv_id": "2403.07974", 440 "relevance": "Contamination-free benchmark design using temporal splits, directly relevant to the paper's analysis of contamination handling in benchmarks." 441 }, 442 { 443 "title": "An empirical evaluation of using large language models for automated unit test generation", 444 "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"], 445 "year": 2024, 446 "relevance": "Cited as part of the review methodology input and as an example of benchmark evaluation practices." 447 }, 448 { 449 "title": "Measuring coding challenge competence with APPS", 450 "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"], 451 "year": 2021, 452 "relevance": "Major code generation benchmark used in the focused case study; cited as positive example for documentation quality (Figure 58)." 453 }, 454 { 455 "title": "The bitter lesson learned from 2,000+ multilingual benchmarks", 456 "authors": ["Minghao Wu", "Weiwei Wang", "Shiyao Liu"], 457 "year": 2025, 458 "relevance": "Related work on benchmark quality at scale, providing guidelines for evaluation practices." 
459 }, 460 { 461 "title": "Benchmark^2: Systematic evaluation of LLM benchmarks", 462 "authors": ["Qianqi Qian", "Chenghua Huang", "Jiangzhuo Xu"], 463 "year": 2026, 464 "relevance": "Concurrent meta-evaluation work assessing LLM benchmarks, representing the growing field of benchmark quality assessment." 465 } 466 ], 467 "engagement_factors": { 468 "practical_relevance": { 469 "score": 3, 470 "justification": "HOW2BENCH is a directly usable 55-item checklist for anyone building or reviewing code benchmarks, with a printable version in the appendix." 471 }, 472 "surprise_contrarian": { 473 "score": 2, 474 "justification": "The finding that benchmark quality has gotten worse in absolute terms despite growing awareness challenges the assumption that the community is improving." 475 }, 476 "fear_safety": { 477 "score": 0, 478 "justification": "No AI safety or security concerns are raised; the paper focuses on research methodology quality." 479 }, 480 "drama_conflict": { 481 "score": 2, 482 "justification": "Calls out widespread poor practices in benchmark construction with specific named examples (HumanEval bugs, MBPP duplicates), though framed diplomatically." 483 }, 484 "demo_ability": { 485 "score": 1, 486 "justification": "The checklist is printed in the appendix but no tool, website, or downloadable resource is provided." 487 }, 488 "brand_recognition": { 489 "score": 1, 490 "justification": "Authors are from established universities (HKUST, CUHK) but not from major AI labs; the paper does not evaluate any branded product." 491 } 492 } 493 }