scan.json (26819B)
1 { 2 "paper": { 3 "title": "TestBench: Evaluating Class-Level Test Case Generation Capability of Large Language Models", 4 "authors": [ 5 "Quanjun Zhang", 6 "Ye Shang", 7 "Chunrong Fang", 8 "Siqi Gu", 9 "Jianyi Zhou", 10 "Zhenyu Chen" 11 ], 12 "year": 2024, 13 "venue": "arXiv", 14 "arxiv_id": "2409.17561", 15 "doi": "10.48550/arXiv.2409.17561" 16 }, 17 "scan_version": 2, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "methodology_tags": ["benchmark-eval"], 20 "key_findings": "TestBench evaluates CodeLlama-13b, GPT-3.5, and GPT-4 on class-level Java test generation across 108 functions from 9 open-source projects. Larger models produce fewer syntax and compilation errors, with GPT-4 achieving 92.51% line coverage and 26.10% mutation kill rate on passing tests. Providing class context improves compilation pass rates, but only GPT-4 benefits from full context—smaller models regress due to noise. A heuristic repair strategy reduces GPT-3.5's syntax error rate from 97.84% to 4.38%.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The paper states 'All of the datasets and source code are in the repository, https://github.com/iSEngLab/TestBench' in the Data Availability section." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The benchmark dataset of 108 Java programs is released at the same GitHub repository. 'All specific versions of the projects are publicly available in our repository.'" 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "The paper mentions Java 17, Maven 3.9, and JUnit 5.0 but provides no requirements.txt, Dockerfile, or detailed environment setup section listing all dependency versions." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions are provided in the paper. 
The repo URL is given but no README contents or reproduction guide is described." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "All results are reported as point estimates (percentages and counts) with no confidence intervals or error bars despite generating 10 test cases per function." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper claims GPT-4 'significantly outperformed the others' but uses no statistical significance tests. All comparisons are based on raw number differences." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Results are reported with baseline context, e.g., 'syntax error rates reduce from 38.12% to 20.25%' and 'compilation pass rates increase from 14.63% to 23.83%', providing from/to context for effect magnitude." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "No justification for why 108 functions is sufficient, no power analysis. Section 6 acknowledges the limited number of functions but frames it as a practical constraint rather than justifying adequacy." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "Ten test cases are generated per function 'to minimize errors caused by incidental factors' but no variance, standard deviation, or spread measures are reported across these runs." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "Three models (CodeLlama-13b, GPT-3.5, GPT-4) are compared against each other across all metrics. However, no comparison with traditional test generation tools (EvoSuite, Randoop) is included." 
76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "CodeLlama-13B-Instruct, GPT-3.5-turbo-1106, and GPT-4-1106-preview were contemporary models at publication time (September 2024)." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "Three context levels (self-contained, simple, full) serve as an ablation of context information (RQ2). The repair strategy is also evaluated separately (RQ3)." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Five evaluation metrics are used: syntactic correctness, compilation correctness, execution correctness, line coverage, and mutation kill rate (Section 4.3)." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": false, 95 "justification": "No human evaluation of generated test cases is performed. All evaluation is automated (static analysis, compilation, execution, JaCoCo coverage, PITest mutation testing)." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "The 108-function benchmark is used purely for evaluation. No fine-tuning or prompt optimization is performed on this data — models are used off-the-shelf with pre-designed prompts." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Figure 7 provides per-project breakdowns of test results for all 9 projects across all three models, with exact counts for each error type." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Failure patterns are discussed extensively: textual descriptions instead of code (Figure 8), template-like content (Figure 9), compilation error causes (Table 2), and meaningless test cases (Section 5.1.4)." 
111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper reports that full context hurts CodeLlama performance (compilation pass rate decreases from 29.91% to 23.06%) and that mutation kill rates are generally low across all models." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "Abstract claims about model scale, context impact, and repair strategy are all supported by experimental results in Sections 5.1-5.3." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": false, 127 "justification": "The paper claims 'with an increase in LLMs' parameter size, the number of syntax errors and compilation errors decreases' and attributes this to model scale. However, the three models differ in architecture, training data, and training procedures — model size is confounded with these factors." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": false, 132 "justification": "The title claims to evaluate 'Class-Level Test Case Generation Capability of Large Language Models' broadly, but only 3 models are tested on Java only, with 108 functions from 9 projects. No qualification of language or model scope in the title." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "Section 6 discusses data leakage and benchmark size but does not consider alternative explanations for the findings, such as architecture differences vs. model size, training data composition, or instruction-tuning quality." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper measures specific metrics (syntax correctness, compilation correctness, coverage, mutation kill rate) and claims about those specific metrics without overframing them as broader capabilities." 
143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "Specific model versions are stated: 'CodeLlama-13B-Instruct, GPT-3.5-turbo-1106, and GPT-4-1106-preview' (Section 4.2)." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "Full prompt templates are shown in Figures 4 and 5, including system prompt, instruction variants, and the test_info framework. The fill values are deterministic from the released benchmark data." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": false, 159 "justification": "No temperature, top-p, max tokens, or other generation hyperparameters are reported for any model. Only a 10-minute timeout for CodeLlama is mentioned." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used. Models are prompted directly without retry loops, tool use, or feedback mechanisms." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 3.1 describes the three-step selection process: crawling Java repos with 1000+ stars, filtering by Maven/JUnit/size, selecting projects for topic diversity, and manually selecting high-quality functions." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 6 'Threats to Validity' discusses two specific threats: data leakage and benchmark size." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "Threats are specific to this study: data leakage from open-source projects potentially in training data (mitigated by TestBench-HumanEval comparison), and the limited number of 108 functions constrained by Maven compilation requirements." 
182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": false, 186 "justification": "The paper does not explicitly state what the results do NOT show. It acknowledges limited function count but doesn't state boundaries like 'results do not apply to other languages' or 'findings are limited to these three specific models.'" 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": true, 193 "justification": "The GitHub repository contains the dataset and source code. 'All of the datasets and source code are in the repository.'" 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section 3.1 describes the three-step process: crawling repos (1000+ stars, Maven/JUnit, 10-100MB), selecting 20 projects for diversity, extracting public functions, and manually selecting 108 from 9 projects." 199 }, 200 "recruitment_methods_described": { 201 "applies": true, 202 "answer": true, 203 "justification": "Project selection criteria are documented: Java repos with 1000+ stars, Maven and JUnit frameworks, 10-100MB size, diverse topics. Function selection criteria: public, non-test, non-interface, non-abstract, non-deprecated, frequently appearing in real-world scenarios." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "The pipeline is documented with counts: 99 candidate projects → 20 selected → functions extracted → manually filtered → 108 functions from 9 projects. Figure 1 shows the construction process." 
209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Funding section states: 'This work is supported partially by the National Natural Science Foundation of China (61932012, 62372228).'" 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are listed: Nanjing University (State Key Laboratory for Novel Software Technology) and Huawei Cloud Computing Technologies Co., Ltd." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": true, 225 "justification": "The National Natural Science Foundation of China is a government funding agency with no financial stake in the evaluation outcomes of specific LLMs." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial interests statement is present. One author is affiliated with Huawei, but no disclosure of potential conflicts related to this affiliation is made." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "No training data cutoff dates are stated for any of the three models. The paper does not mention when model training data was collected." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": true, 242 "justification": "Section 6 explicitly discusses this: 'TestBench are sourced from open-source projects on GitHub. Thus, there may be an overlap between TestBench and the training data of LLMs.' They build TestBench-HumanEval to assess consistency." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": true, 247 "justification": "Section 6 acknowledges that benchmark projects may overlap with training data and constructs TestBench-HumanEval as a secondary validation, showing consistent results with CodeLlama." 
248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study. It is a benchmark evaluation of LLM-generated test cases." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in this study." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in this study." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No API costs, token counts, or wall-clock time reported. Only a 10-minute timeout for CodeLlama is mentioned, with no cost information for GPT-3.5 or GPT-4 API usage." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "The paper mentions 'a local Linux server' but provides no GPU hours, hardware specifications, or total API spend for the experiments." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "Ten test cases are generated per function but no seed sensitivity analysis or variance across these generations is reported. Results are aggregated without spread measures." 
304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": true, 308 "justification": "Section 4.4 states: 'For each generation task, we generate 10 test cases to minimize errors caused by incidental factors.'" 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "No hyperparameter search is described. Models are used with unspecified default settings. No mention of trying different temperatures or sampling parameters." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": true, 318 "justification": "All three context configurations are reported rather than selecting the best. No cherry-picking of configurations — results for all prompt types and all models are shown." 319 }, 320 "multiple_comparison_correction": { 321 "applies": true, 322 "answer": false, 323 "justification": "No statistical tests are performed at all, so no multiple comparison corrections. Multiple comparisons are made across 3 models × 3 contexts × 5 metrics without any statistical framework." 324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The authors create the benchmark and design the prompts, then evaluate models on their own benchmark. No discussion of potential author-evaluation bias." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "Three models of vastly different sizes (13B for CodeLlama; GPT-3.5 and GPT-4 sizes are not officially disclosed and are only roughly estimated at ~100B and ~1T) are compared without discussing compute cost differences. No performance-per-compute analysis." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "The paper argues for class-level testing over standalone functions (vs TestEval) but does not question whether its five metrics actually capture test case quality comprehensively, or discuss construct validity of the benchmark." 
339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No scaffolding is used. Models are prompted directly without agentic frameworks." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "The paper does not discuss when the benchmark projects were created relative to model training cutoffs. No temporal analysis of potential leakage." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of whether providing class context (full context prompt) leaks information that wouldn't be available in a real development scenario beyond what is intentionally designed." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "Multiple functions are drawn from the same 9 projects (e.g., 35 from Java-Algorithm) but non-independence between functions within the same project is not discussed." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No concrete leakage detection method is used. The TestBench-HumanEval consistency check is mentioned but no canary strings, membership inference, or decontamination pipelines are applied." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "Larger models produce fewer syntax and compilation errors in generated test cases.", 372 "evidence": "Figure 6: syntax errors drop from 20.2% (CodeLlama) to 4.4% (GPT-3.5) to 2.2% (GPT-4). 
Compilation errors drop from 55.9% to 50.7% to 47.9% (Section 5.1).", 373 "supported": "moderate" 374 }, 375 { 376 "claim": "GPT-4 significantly outperforms other models in code coverage and mutation kill rate on passing test cases.", 377 "evidence": "Table 4: GPT-4 achieves 92.51% line coverage and 26.10% mutation kill rate vs GPT-3.5's 71.07%/17.03% and CodeLlama's 76.43%/21.73% (Section 5.1.4).", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "Providing context improves compilation pass rates, but only larger models benefit from richer (full) context.", 382 "evidence": "Figure 10: GPT-4 compilation pass rates increase progressively (34.81% → 52.96% → 61.94%) across contexts, while CodeLlama regresses from 29.91% to 23.06% with full context (Section 5.2).", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "The heuristic repair strategy significantly reduces syntax errors in generated test cases.", 387 "evidence": "Figure 11: GPT-3.5 syntax error rate drops from 97.84% to 4.38%; CodeLlama from 38.12% to 20.25%; GPT-4 from 2.35% to 2.16% (Section 5.3).", 388 "supported": "strong" 389 }, 390 { 391 "claim": "LLMs' ability to detect defects through generated test cases is somewhat limited.", 392 "evidence": "Table 4: Mutation kill rates are low across all models (21.73%, 17.03%, 26.10%) despite relatively high line coverage, indicating test assertions are often weak (Section 5.1.4).", 393 "supported": "strong" 394 } 395 ], 396 "red_flags": [ 397 { 398 "flag": "No statistical tests despite comparative claims", 399 "detail": "The paper claims GPT-4 'significantly outperformed' others and makes multiple comparative claims across models and contexts, but uses no statistical significance tests. All conclusions are based on raw percentage comparisons." 
400 }, 401 { 402 "flag": "No variance reported across 10 generations", 403 "detail": "Ten test cases are generated per function to 'minimize errors caused by incidental factors' but no variance, standard deviation, or confidence intervals are reported. The reader cannot assess result stability." 404 }, 405 { 406 "flag": "Missing hyperparameters", 407 "detail": "No temperature, top-p, or max_tokens settings are reported for any model. These settings significantly affect generation quality, and their omission makes exact reproduction impossible." 408 }, 409 { 410 "flag": "Confounded model size claims", 411 "detail": "The paper attributes performance differences to model 'parameter size' but the three models differ in architecture, training data, instruction-tuning methods, and more. Model size is confounded with these factors." 412 }, 413 { 414 "flag": "No comparison with traditional test generation tools", 415 "detail": "EvoSuite and Randoop are discussed in related work but never compared against. This omits the most relevant baselines for test generation quality." 416 }, 417 { 418 "flag": "Manual function selection bias", 419 "detail": "Functions are 'manually selected' based on whether they 'frequently appear in real-world development scenarios.' This subjective criterion could bias the benchmark toward functions that favor LLM generation." 420 } 421 ], 422 "cited_papers": [ 423 { 424 "title": "Evaluating Large Language Models Trained on Code", 425 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 426 "year": 2021, 427 "arxiv_id": "2107.03374", 428 "relevance": "Introduced HumanEval benchmark for evaluating LLM code generation capabilities." 429 }, 430 { 431 "title": "SWE-Bench: Can Language Models Resolve Real-World GitHub Issues?", 432 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"], 433 "year": 2024, 434 "arxiv_id": "2310.06770", 435 "relevance": "Benchmark for evaluating LLMs on real-world GitHub issue resolution, relevant to code generation evaluation methodology." 
436 }, 437 { 438 "title": "ChatUniTest: A Framework for LLM-based Test Generation", 439 "authors": ["Yinghao Chen", "Zehao Hu", "Chen Zhi"], 440 "year": 2024, 441 "relevance": "LLM-based test generation framework directly relevant to the test generation evaluation domain." 442 }, 443 { 444 "title": "TESTEVAL: Benchmarking Large Language Models for Test Case Generation", 445 "authors": ["Wenhan Wang", "Chenyuan Yang", "Zhijie Wang"], 446 "year": 2024, 447 "arxiv_id": "2406.04531", 448 "relevance": "Most closely related work — a test generation benchmark using LeetCode programs, which TestBench contrasts against." 449 }, 450 { 451 "title": "EvoSuite: Automatic Test Suite Generation for Object-Oriented Software", 452 "authors": ["Gordon Fraser", "Andrea Arcuri"], 453 "year": 2011, 454 "doi": "10.1145/2025113.2025179", 455 "relevance": "Foundational search-based test generation tool, baseline for automated testing approaches." 456 }, 457 { 458 "title": "StarCoder: May the Source Be with You!", 459 "authors": ["Raymond Li", "Loubna Ben Allal"], 460 "year": 2023, 461 "arxiv_id": "2305.06161", 462 "relevance": "Open-source code LLM relevant to understanding code generation model capabilities." 463 }, 464 { 465 "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-Trained Models", 466 "authors": ["Hao Yu", "Bo Shen", "Dezhi Ran"], 467 "year": 2024, 468 "doi": "10.1145/3597503.3623316", 469 "relevance": "Benchmark for pragmatic code generation evaluating non-standalone functions, methodologically related." 470 }, 471 { 472 "title": "Magicoder: Empowering Code Generation with OSS-Instruct", 473 "authors": ["Yuxiang Wei", "Zhe Wang", "Jiawei Liu"], 474 "year": 2024, 475 "arxiv_id": "2312.02120", 476 "relevance": "Code generation model training methodology relevant to LLM-based software engineering." 
477 }, 478 { 479 "title": "TestART: Improving LLM-based Unit Test via Co-evolution of Automated Generation and Repair Iteration", 480 "authors": ["Siqi Gu", "Chunrong Fang", "Quanjun Zhang"], 481 "year": 2024, 482 "relevance": "LLM-based test generation and repair approach from the same research group, directly related." 483 }, 484 { 485 "title": "Large-Scale, Independent and Comprehensive Study of the Power of LLMs for Test Case Generation", 486 "authors": ["Wendkûuni C. Ouédraogo", "Kader Kaboré"], 487 "year": 2024, 488 "arxiv_id": "2407.00225", 489 "relevance": "Large-scale LLM test generation evaluation study, directly relevant to benchmarking test generation capabilities." 490 }, 491 { 492 "title": "Evaluating and Improving ChatGPT for Unit Test Generation", 493 "authors": ["Zhiqiang Yuan", "Mingwei Liu", "Shiji Ding"], 494 "year": 2024, 495 "relevance": "Evaluates ChatGPT for unit test generation with improvement strategies, directly relevant to LLM test generation evaluation." 496 }, 497 { 498 "title": "Software Testing With Large Language Models: Survey, Landscape, and Vision", 499 "authors": ["Junjie Wang", "Yuchao Huang", "Chunyang Chen"], 500 "year": 2024, 501 "relevance": "Survey of LLM-based software testing covering the broader landscape this benchmark contributes to." 502 } 503 ] 504 }