scan.json (22883B)
1 { 2 "paper": { 3 "title": "Benchmarking AI Models in Software Engineering: A Review, Search Tool, and Unified Approach for Elevating Benchmark Quality", 4 "authors": ["Roham Koohestani", "Philippe de Bekker", "Begüm Koç", "Maliheh Izadi"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2503.05860" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "GitHub repository provided at https://github.com/AISE-TUDelft/AI4SE-benchmarks. The paper states 'we publicly release the material of our review, user study, and the enhanced benchmark.'" 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The review data, user study data, and 50% of the manually refined HumanEvalNext benchmark are publicly released via the GitHub repository. The paper states full benchmark will be available upon acceptance." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions using 'one NVIDIA A100 80GB GPU and 32 CPU cores' for inference but does not provide requirements.txt, Dockerfile, or detailed library versions for reproducing the environment." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided in the paper. The experimental setup section describes the general approach but lacks specific commands or scripts to replicate the experiments." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Pass@1 results are reported as point estimates without confidence intervals or error bars. The boxplot in Figure 7 shows distributions but individual model results lack uncertainty quantification." 
37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": true, 41 "justification": "The agentic pipeline noninferiority study uses one-sided one-sample t-tests and Wilcoxon signed-rank tests with reported p-values. However, no significance tests are applied to the main HumanEval vs HumanEvalNext performance comparison claims." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Performance drops are reported with baseline context: e.g., 'average pass@1 score decreases by 31.2%' from specific baselines (e.g., Nxcode-CQ from 87.23% to 51.22%). Mean ratings with SD reported for noninferiority study (0.16, SD=0.67)." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification is given for the choice of 10 models, 22 user study participants, or 100 MBPP problems for the generalizability study. No power analysis is discussed." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Standard deviations are reported for the noninferiority study (SD=0.67, 0.51, 0.49). The boxplot in Figure 7 shows the distribution of pass@1 drops across models." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "HumanEvalNext is compared against the original HumanEval and HumanEvalPlus benchmarks. The agentic pipeline is compared against the human-improved version." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": false, 68 "justification": "The 10 evaluated models are mostly 6-15B parameter open-source models. No frontier models (GPT-4, Claude, Gemini) are evaluated, and the paper acknowledges this gap in the discussion section." 
69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": false, 73 "justification": "No ablation study isolates the contribution of individual BenchFrame improvements (corrected solutions, added type annotations, improved tests, edge cases). All changes are bundled together." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": false, 78 "justification": "Only pass@1 is used as the evaluation metric for model performance on the benchmarks. No other metrics (e.g., pass@5, pass@10, execution time) are reported." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "The user study with 22 participants evaluated BenchScout. Two independent reviewers rated human vs. agentic benchmark improvements on a 5-point ordinal scale for the noninferiority study." 84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is not a machine learning training study; the benchmarks are evaluation instruments, not train/test splits for model development." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Per-model breakdowns are provided in Tables XXII-XXV. Per-category distribution of benchmarks is shown in Figure 3. Per-problem difficulty ranking is discussed." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper discusses specific failure cases: CodeQwen1.5 dropping from 87.2% to 10.98%, specific HumanEval Task 47 errors, and pitfalls of the agentic pipeline (docstring leaking solutions, unintended assumptions)." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The agentic pipeline's pitfalls are reported (docstrings revealing solutions, canonical solutions making unintended assumptions, test format issues). The paper also shows the agentic version sometimes performs worse than human version (Table XXIV)." 
104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims of 31.22% and 19.94% pass@1 drops are supported by Tables XXII-XXIII. User study scores of 4.5, 4.0, and 4.1 are supported by the questionnaire results. 273 benchmarks from 247 studies is supported by Section III." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "The paper implies that performance drops are caused by better benchmark quality ('enhanced difficulty and refined evaluation precision'), but confounds exist: added edge cases, corrected errors, and increased test count are not isolated. The dramatic CodeQwen1.5 drop (87.2% to 10.98%) is attributed to data leakage without direct evidence." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper proposes BenchFrame as a general methodology but validates it only on HumanEval and 100 MBPP problems, both Python-only code generation tasks. Claims like 'one can extend it to other datasets with similar success' go beyond the evidence. The title claims 'Benchmarking AI Models in Software Engineering' broadly." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The threats to validity section discusses generic methodological concerns (selection bias, generalizability) but does not discuss specific alternative explanations for the performance drops, such as whether test case inflation rather than quality improvement drives the results." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Full HuggingFace model identifiers are specified (e.g., 'NTQAI/Nxcode-CQ-7B-orpo', 'deepseek-ai/deepseek-coder-6.7b-instruct'). The agentic pipeline uses 'o3-mini-2025-01-31' with specific date." 
133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper describes the prompting approach ('an instructional preamble asking the model to finish the implementation') but does not provide the actual prompt text used for evaluation." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No temperature, top-p, or other sampling parameters are reported for the model evaluations. A 15-second timeout is mentioned but core inference parameters are absent." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The agentic pipeline for BenchFrame is described in detail with a flowchart (Figure 8) showing the three phases (text, code, test improvement), validation loop, retry logic (3 attempts), and fallback to minimal passing tests." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "The literature review pipeline is documented: search platforms (Google Scholar, Semantic Scholar), keywords, selection criteria (originality, reproducibility, accessibility), duplicate removal, forward/backward snowballing, resulting in 247 papers. Inter-rater agreement of 96.4% is reported." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section VI-E 'Threats to the Validity' discusses construct, internal, and external validity threats in detail." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Specific threats are discussed: the 22-participant sample may limit generalizability, 10 models may be insufficient, HumanEvalNext will also be affected by data leakage over time, the user study may not capture full tool functionality." 
165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what the results do NOT show. While limitations mention possible extensions (more models, more languages), specific scope boundaries (e.g., 'these results apply only to Python function-level code generation benchmarks') are not stated." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The replication package at https://github.com/AISE-TUDelft/AI4SE-benchmarks includes review data, user study data, and 50% of HumanEvalNext. Full benchmark promised upon acceptance." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section III-A describes the review methodology: search platforms, keywords, selection criteria, duplicate removal, snowballing, taxonomy development. Section IV-C describes user study recruitment." 182 }, 183 "recruitment_methods_described": { 184 "applies": true, 185 "answer": true, 186 "justification": "User study participants are described: 22 people from industry (9) and academia (13), with roles (6 Researchers, 5 PhD Candidates, 5 Students, 4 Engineers, 2 Lead Researchers) and experience levels detailed." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The review pipeline documents stages from search to final benchmark list. The BenchScout pipeline (Figure 5) shows data collection through metadata enrichment, clustering, and frontend. The agentic pipeline (Figure 8) shows the benchmark improvement flow." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section IX states: 'This research was supported in part by an Amazon Research Award granted to Dr. 
Maliheh Izadi.'" 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "All authors are listed as affiliated with EEMCS faculty, Delft University of Technology, The Netherlands." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "Amazon funded the research but the paper does not evaluate any Amazon products. The acknowledgment states 'The views and conclusions contained in this paper are those of the authors and do not necessarily reflect the position or policies of Amazon.'" 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper beyond the funding acknowledgment." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper evaluates 10 code LLMs on HumanEval benchmarks but does not state the training data cutoff dates for any of the models." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "The paper extensively discusses data contamination: HumanEval Task 47 shows ChatGPT reproducing benchmark errors (Figure 1), suggesting contamination. The paper argues HumanEvalNext mitigates this by introducing new edge cases and modifications." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": true, 230 "justification": "Contamination is a central theme. The paper shows evidence of contamination (ChatGPT reproducing HumanEval errors), discusses how CodeQwen1.5's dramatic drop suggests data leakage, and positions HumanEvalNext as a mitigation strategy." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": true, 236 "answer": false, 237 "justification": "No pre-registration is mentioned for the user study with 22 participants." 
238 }, 239 "irb_or_ethics_approval": { 240 "applies": true, 241 "answer": false, 242 "justification": "No IRB or ethics board approval is mentioned for the user study." 243 }, 244 "demographics_reported": { 245 "applies": true, 246 "answer": true, 247 "justification": "Demographics are detailed: roles (Researchers, PhD Candidates, Students, Engineers, Lead Researchers), experience levels (1-3, 3-5, 5+, <1 years), industry (9) vs academia (13), and AI4SE familiarity ratings." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": true, 251 "answer": false, 252 "justification": "The paper states the aim to 'assess the tool's effectiveness across a diverse group of users with varying degrees of expertise' but does not specify inclusion/exclusion criteria for participant selection." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "The user study is not an experimental study with treatment/control conditions; all participants used the same tool." 258 }, 259 "blinding_described": { 260 "applies": true, 261 "answer": true, 262 "justification": "For the peer review of HumanEvalNext: 'the reviewer was not informed of specific changes made by the first author.' However, the user study itself had no blinding." 263 }, 264 "attrition_reported": { 265 "applies": true, 266 "answer": false, 267 "justification": "No mention of whether all 22 participants completed the full study or if any dropped out." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "The agentic pipeline cost is reported: 'the total incurred cost from calling the apis for the models was $5.2823.' 
Manual effort is also quantified: 'initial creation took over 100 hours, the independent peer-review process required an additional 16 hours.'" 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": true, 279 "justification": "Hardware is stated: 'one NVIDIA A100 80GB GPU and 32 CPU cores.' The manual effort (100+ hours for benchmark creation, 16 hours for peer review) is quantified." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Average pass@1 scores decrease by 31.22% (mean) and 26.02% (median) when evaluating 10 LLMs on HumanEvalNext compared to original HumanEval.", 286 "evidence": "Table XXIII shows per-model pass@1 scores on HumanEval vs HumanEvalNext, with drops ranging from 18.94% to 76.22%. Figure 7 shows the distribution.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "273 AI4SE benchmarks were identified from 247 studies since 2014.", 291 "evidence": "Section III-A describes the systematic review methodology with search criteria, quality assessment, and snowballing. Figure 2 shows publication trends.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "BenchScout achieved usability, effectiveness, and intuitiveness scores of 4.5, 4.0, and 4.1 out of 5 in a user study with 22 participants.", 296 "evidence": "Section IV-C reports detailed questionnaire results with demographic breakdown, Likert scale responses, and qualitative feedback.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "The agentic pipeline is noninferior to the human-improved process for benchmark refinement.", 301 "evidence": "Noninferiority tests with margin δ=-0.5: mean ratings 0.16-0.53, t-statistics 12.68-25.75, p-values all 1.000, Wilcoxon tests confirming (Section VI-B).", 302 "supported": "strong" 303 }, 304 { 305 "claim": "Certain models may have benefited from data leakage on the original HumanEval, as evidenced by CodeQwen1.5 dropping from 87.2% to 10.98%.", 306 "evidence": "Table XXIII shows the dramatic drop. 
ChatGPT's reproduction of the HumanEval Task 47 errors (Figure 1) provides indirect evidence, but no direct contamination analysis is performed.", 307 "supported": "weak" 308 } 309 ], 310 "methodology_tags": ["meta-analysis", "benchmark-eval", "case-study"], 311 "key_findings": "This paper reviews 273 AI4SE benchmarks from 247 studies, revealing a fragmented landscape dominated by code generation (34.4%) with systemic issues including poor maintenance, language specificity, and lack of peer review. BenchFrame, a peer-review-oriented methodology for improving benchmark quality, is demonstrated through HumanEvalNext, which causes an average 31.2% drop in pass@1 scores across 10 code LLMs compared to the original HumanEval. An agentic pipeline for automating benchmark improvements is shown to be noninferior to manual human refinement, with API costs of only $5.28.", 312 "red_flags": [ 313 { 314 "flag": "No ablation of individual improvements", 315 "detail": "BenchFrame bundles multiple changes (corrected solutions, type annotations, more tests, edge cases, clearer descriptions) without isolating which changes cause the performance drops. The 31.2% decline could be driven primarily by increased test count (1.92x more asserts) rather than quality improvements." 316 }, 317 { 318 "flag": "Single metric evaluation", 319 "detail": "Only pass@1 is reported for all model evaluations. No other metrics (pass@5, pass@10, execution efficiency) are used, limiting the robustness of performance claims." 320 }, 321 { 322 "flag": "Weak causal claim for data leakage", 323 "detail": "CodeQwen1.5's dramatic 76.22-percentage-point drop in pass@1 (from 87.2% to 10.98%) is attributed to data leakage without direct evidence of contamination. Alternative explanations (e.g., the model being particularly sensitive to edge cases or type annotations) are not explored."
324 }, 325 { 326 "flag": "Missing hyperparameters for model evaluations", 327 "detail": "Temperature, top-p, and other sampling parameters are not reported for the 10 model evaluations, making reproduction difficult and potentially affecting pass@1 results significantly." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Evaluating large language models trained on code", 333 "authors": ["M. Chen"], 334 "year": 2021, 335 "arxiv_id": "2107.03374", 336 "relevance": "Original HumanEval benchmark paper, foundational to code generation evaluation." 337 }, 338 { 339 "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation", 340 "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"], 341 "year": 2023, 342 "arxiv_id": "2305.01210", 343 "relevance": "HumanEvalPlus/EvalPlus benchmark that augments HumanEval test suites, directly compared in this study." 344 }, 345 { 346 "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code", 347 "authors": ["N. Jain"], 348 "year": 2024, 349 "arxiv_id": "2403.07974", 350 "relevance": "Dynamic benchmark approach addressing contamination, positioned as complementary to BenchFrame's repair-and-refine approach." 351 }, 352 { 353 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 354 "authors": ["C. E. Jimenez"], 355 "year": 2024, 356 "relevance": "Major real-world SE benchmark for evaluating LLM agents on GitHub issue resolution." 357 }, 358 { 359 "title": "BigCodeBench", 360 "authors": ["T. Y. Zhuo"], 361 "year": 2024, 362 "relevance": "Large-scale code generation benchmark with 1,140 Python problems, representative of class-level evaluation." 363 }, 364 { 365 "title": "AgentBench: Evaluating LLMs as Agents", 366 "authors": ["X. Liu"], 367 "year": 2023, 368 "relevance": "Benchmark for evaluating LLMs as agents across multiple environments." 
369 }, 370 { 371 "title": "Program synthesis with large language models", 372 "authors": ["J. Austin"], 373 "year": 2021, 374 "arxiv_id": "2108.07732", 375 "relevance": "MBPP benchmark paper, one of the two foundational benchmarks improved by BenchFrame in this study." 376 }, 377 { 378 "title": "Top leaderboard ranking = top coding proficiency, always? EvoEval: Evolving coding benchmarks via LLM", 379 "authors": ["C. S. Xia", "Y. Deng", "L. Zhang"], 380 "year": 2024, 381 "arxiv_id": "2403.19114", 382 "relevance": "Addresses benchmark evolution and leaderboard gaming in code generation evaluation." 383 }, 384 { 385 "title": "CodeElo: Benchmarking Competition-level Code Generation of LLMs with Human-comparable Elo Ratings", 386 "authors": ["S. Quan"], 387 "year": 2025, 388 "arxiv_id": "2501.01257", 389 "relevance": "Competition-level code generation benchmark with Elo-based rating system." 390 }, 391 { 392 "title": "CodeRAG-Bench", 393 "authors": ["Z. Wang"], 394 "year": 2024, 395 "relevance": "Benchmark for retrieval-augmented code generation evaluation." 396 } 397 ] 398 }