scan.json (25991B)
1 { 2 "paper": { 3 "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation", 4 "authors": [ 5 "Max Schäfer", 6 "Sarah Nadi", 7 "Aryaz Eghbali", 8 "Frank Tip" 9 ], 10 "year": 2023, 11 "venue": "IEEE Transactions on Software Engineering (arXiv preprint)", 12 "arxiv_id": "2302.06527" 13 }, 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper states 'our approach in TESTPILOT, an LLM-based test generation tool for JavaScript that...is available as open-source software at https://github.com/githubnext/testpilot' (Introduction, contributions bullet)." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper states 'The raw data and analysis for all our experiments can be found at https://doi.org/10.6084/m9.figshare.23653371' (Section 2 introduction)." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions JavaScript, Node.js, Mocha, Istanbul/nyc, and a GitHub Actions Linux VM (2-core CPU, 7GB RAM) but does not provide a requirements file, Dockerfile, or detailed dependency specifications sufficient to recreate the environment." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "While data and code are released via figshare and GitHub, the paper itself does not include step-by-step reproduction instructions. The artifact link points to external data but no explicit reproduction guide is included in the paper." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper reports median values across 10 runs but does not report confidence intervals or error bars for main coverage results. Coverage figures in Table 2 are single-value medians." 
42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper uses Wilcoxon paired rank-sum tests to compare TESTPILOT vs. Nessie coverage (Section 4.2, p-values 0.002 and 0.027), and Wilcoxon matched-pairs signed rank tests for ablation comparisons (Section 4.5)." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports Cliff's delta effect sizes for all major comparisons (e.g., 'large effect size, measured by Cliff's delta [55], of 0.493 for statement coverage and a medium one (0.431) for branch coverage' in Section 4.2, and multiple effect sizes in Section 4.5)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper evaluates 25 npm packages but does not justify why 25 packages were chosen or perform a power analysis. The selection criteria are described but not statistically justified." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "The paper explicitly states 'we run all experiments 10 times' and reports medians, but does not report standard deviation, IQR, or any spread measure alongside the coverage medians in the main tables." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares TESTPILOT against Nessie, 'the state-of-the-art JavaScript test generator' (RQ2, Section 4.2), and also compares against three different LLMs (RQ7, Section 4.7)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Nessie was published at ICSE 2022 and is described as the current state-of-the-art feedback-directed random test generation technique for JavaScript; the comparison is contemporaneous with the 2023 paper." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "RQ5 (Section 4.5) presents an ablation study disabling each of the four prompt refiners (FnBodyIncluder, DocCommentIncluder, SnippetIncluder, RetryWithError) one at a time, with statistical significance testing on the differences." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper reports statement coverage, branch coverage, percentage of passing tests, non-trivial assertions, and test similarity (maximum similarity to existing tests, measured by normalized edit distance) as evaluation metrics across RQ1-RQ6." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "The paper makes explicit claims about test readability and naturalness (e.g., 'tests look quite natural and similar to tests that a human developer might write' in Section 2.3, and readability comparisons with Nessie throughout). These readability/naturalness claims make human evaluation relevant, not 'clearly irrelevant to the claims' as the schema requires for applies=false. The paper itself acknowledges this: 'we do not formally assess the readability of these tests. In the future, it would be interesting to conduct user studies to assess the readability' (Section 5, Construct Validity). Since the paper could reasonably be expected to include human evaluation of test quality given its readability claims, applies=true with answer=false is correct." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The 25 npm packages are clearly specified as the evaluation benchmark (Table 1), and there is no tuning performed on these packages; TESTPILOT generates tests at inference time without using any test examples from the packages as training data."
94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 2 provides per-package coverage results for all 25 npm packages, and Table 4 provides per-package LLM comparisons. Results are not just aggregate medians." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "RQ4 (Section 4.4) analyzes failing tests by type: assertion errors, file-system errors, correctness errors, timeout errors, and other errors. Figure 7 shows the breakdown per package." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The ablation study (RQ5) shows cases where TESTPILOT performs worse (e.g., Figure 9 shows function body inclusion confusing the model). Section 4.5 notes '394 cases (7.3%) the refined prompt was less effective than the original prompt.'" 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "All major abstract claims are supported by results: median 70.2% statement coverage (Table 2), 52.8% branch coverage (Table 2), Nessie comparison (Section 4.2), ablation findings (Section 4.5), and LLM comparison (Section 4.7) are all present in the results." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The ablation study (RQ5) makes causal claims that each prompt component contributes to effectiveness; this is supported by controlled single-variable ablations with statistical significance tests (Wilcoxon matched-pairs signed rank tests with effect sizes in Section 4.5)." 
121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper explicitly bounds its claims: 'while our technique is conceptually language-agnostic, our current implementation of TESTPILOT targets JavaScript, and thus we cannot generalize our results to other languages' (Section 5, External Validity)." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section 5 (Threats to Validity) discusses specific alternative explanations: internal validity threat from snippet matching heuristics, construct validity threat from non-trivial assertion definition, and external validity threat that performance may not generalize to proprietary code or other languages." 131 } 132 }, 133 "setup_transparency": { 134 "model_versions_specified": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper specifies exact model versions: 'gpt-3.5-turbo-0301' (Section 3.2), 'code-cushman-002' (referenced as OpenAI's older Codex model), and StarCoder (with HuggingFace reference [49]). The version gpt-3.5-turbo-0301 is a specific snapshot." 138 }, 139 "prompts_provided": { 140 "applies": true, 141 "answer": true, 142 "justification": "Figures 1, 3, and 4 show the actual prompt templates and example prompts used in experiments, including the full prompt structure and real examples. The structure is shown in Figure 1 and concrete populated examples appear in Figures 3 and 4." 143 }, 144 "hyperparameters_reported": { 145 "applies": true, 146 "answer": true, 147 "justification": "Section 3.2 states: 'sampling five completions of up to 100 tokens at temperature zero, with all other options at their default values.' StarCoder uses temperature 0.01. Key hyperparameters are reported." 
148 }, 149 "scaffolding_described": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 2 and Algorithms 1-2 describe TESTPILOT's architecture in detail: API Explorer, Documentation Miner, Prompt Generator, Test Validator, and Prompt Refiner components with their logic, including the retry-with-error feedback loop." 153 }, 154 "data_preprocessing_documented": { 155 "applies": true, 156 "answer": true, 157 "justification": "The documentation mining process is described in Section 2.1 (Documentation Miner), and the API exploration procedure is formalized in Algorithm 1. The paper describes how snippets are matched to functions via textual containment." 158 } 159 }, 160 "limitations_and_scope": { 161 "limitations_section_present": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 5 'THREATS TO VALIDITY' contains dedicated subsections on Internal Validity, Construct Validity, and External Validity with substantive discussion." 165 }, 166 "threats_to_validity_specific": { 167 "applies": true, 168 "answer": true, 169 "justification": "Threats are specific to this study: the snippet matching heuristic for same-name functions (internal validity), the low bar for non-trivial assertion definition and limitations of backwards slicing in JavaScript (construct validity), and dependence on 25 npm packages and potential non-generalizability to proprietary code (external validity)." 170 }, 171 "scope_boundaries_stated": { 172 "applies": true, 173 "answer": true, 174 "justification": "The paper explicitly states 'we cannot generalize our results to other languages' and notes performance may not generalize to 'proprietary code that was never seen in the LLM's training set' (Section 5, External Validity)." 
175 } 176 }, 177 "data_integrity": { 178 "raw_data_available": { 179 "applies": true, 180 "answer": true, 181 "justification": "Raw data is available at https://doi.org/10.6084/m9.figshare.23653371, explicitly stated in the paper: 'The raw data and analysis for all our experiments can be found at' this URL." 182 }, 183 "data_collection_described": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section 3.2 describes how the 25 npm packages were selected (first 10 from Nessie benchmark, 10 new GitHub-hosted packages from different domains, 5 GitLab-hosted packages). Selection criteria (popularity, domain diversity, documentation availability) are stated." 187 }, 188 "recruitment_methods_described": { 189 "applies": false, 190 "answer": false, 191 "justification": "This paper has no human participants. Data source is publicly available npm packages on GitHub and GitLab, not human subjects." 192 }, 193 "data_pipeline_documented": { 194 "applies": true, 195 "answer": true, 196 "justification": "The test generation pipeline is documented through Algorithms 1 and 2, and the architecture description in Section 2. The deduplication of generated tests is also described (stripping comments, comparing normalized forms)." 197 } 198 }, 199 "conflicts_of_interest": { 200 "funding_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "The Acknowledgment section states funding sources: NSF grants CCF-1907727 and CCF-2307742 (F. Tip) and Canada Research Chairs Program and NSERC RGPIN-2017-04289 (S. Nadi)." 204 }, 205 "affiliations_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Author affiliations are disclosed on the first page: M. Schäfer is with GitHub, UK; S. Nadi with University of Alberta; A. Eghbali with University of Stuttgart; F. Tip with Northeastern University. The acknowledgment also notes S. Nadi and F. Tip were sabbatical visitors at GitHub during this work." 
209 }, 210 "funder_independent_of_outcome": { 211 "applies": true, 212 "answer": false, 213 "justification": "M. Schäfer is employed by GitHub, and the acknowledgment notes the other authors conducted this research as sabbatical visitors or interns at GitHub. The paper evaluates a GitHub-hosted tool (TESTPILOT) and compares against competing approaches. GitHub has a financial interest in demonstrating LLM-based test generation effectiveness (related to GitHub Copilot)." 214 }, 215 "financial_interests_declared": { 216 "applies": true, 217 "answer": false, 218 "justification": "There is no competing interests statement or declaration of financial interests (patents, equity). The acknowledgment mentions GitHub affiliation but does not formally declare whether any authors hold financial interests in the outcomes." 219 } 220 }, 221 "contamination": { 222 "training_cutoff_stated": { 223 "applies": true, 224 "answer": false, 225 "justification": "The paper acknowledges that 'gpt3.5-turbo...was trained on GitHub repositories' and uses this as motivation for including GitLab packages, but does not state a specific training data cutoff date for any of the three LLMs evaluated." 226 }, 227 "train_test_overlap_discussed": { 228 "applies": true, 229 "answer": true, 230 "justification": "RQ6 (Section 4.6) is entirely dedicated to investigating memorization by measuring maximum similarity (normalized edit distance) between generated tests and existing tests. The paper explicitly addresses whether TESTPILOT reproduces memorized tests." 231 }, 232 "benchmark_contamination_addressed": { 233 "applies": true, 234 "answer": true, 235 "justification": "The paper acknowledges 'since gpt3.5-turbo...was trained on GitHub repositories, we have to assume that all our subject packages...were part of the model's training set' (Section 3.2), and mitigates by including 5 GitLab packages not in GitHub training data."
236 } 237 }, 238 "human_studies": { 239 "pre_registered": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study; it evaluates automated test generation on npm packages." 243 }, 244 "irb_or_ethics_approval": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "demographics_reported": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "inclusion_exclusion_criteria": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "randomization_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "blinding_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "attrition_reported": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 } 274 }, 275 "cost_and_practicality": { 276 "inference_cost_reported": { 277 "applies": true, 278 "answer": true, 279 "justification": "Section 4.7 reports wall-clock time: 'median time for TESTPILOT to generate tests for a given function using gpt3.5-turbo is 15s, and the median time to generate a complete test suite for a given package is 6m 55s.' Similar figures are given for StarCoder (24s/function, 10m 48s/package) and code-cushman-002 (11s/function, 4m 53s/package). API cost in dollars is not reported." 280 }, 281 "compute_budget_stated": { 282 "applies": true, 283 "answer": false, 284 "justification": "The paper mentions tests were run on 'a standard GitHub Actions Linux VM with a 2-core CPU, 7GB of RAM, and 14GB of SSD disk space' (footnote 12) but does not state total API spend or GPU hours consumed for the experiments." 
285 } 286 } 287 }, 288 "claims": [ 289 { 290 "claim": "TESTPILOT achieves a median statement coverage of 70.2% and branch coverage of 52.8% on 25 npm packages using gpt3.5-turbo.", 291 "evidence": "Table 2, Section 4.1 (RQ1). Results are medians over 10 runs across 25 packages. Per-package results range from 33.9% to 93.1% statement coverage.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "TESTPILOT significantly outperforms Nessie (state-of-the-art JavaScript test generator) in both statement coverage (70.2% vs. 51.3%) and branch coverage (52.8% vs. 25.6%).", 296 "evidence": "Table 2, Section 4.2 (RQ2). Wilcoxon paired rank-sum tests: p=0.002 (statement) and p=0.027 (branch). Cliff's delta effect sizes: 0.493 (large) and 0.431 (medium). TESTPILOT outperforms Nessie on 17 of 24 comparable packages.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "All four prompt refiners contribute to generating more effective tests; disabling any refiner results in statistically significant performance degradation.", 301 "evidence": "Section 4.5 (RQ5), Figure 8. Wilcoxon matched-pairs signed rank tests show statistically significant differences between full configuration and each ablated configuration. Largest effect: Cliff's delta 0.582 for FnBodyIncluder, 0.531 for DocCommentIncluder on passing tests.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "92.8% of TESTPILOT's generated tests have at most 50% similarity to existing tests (measured by normalized edit distance), with none being exact copies.", 306 "evidence": "Section 4.6 (RQ6), Figure 10. 
Maximum similarity measured using normalized Levenshtein distance against all existing tests in each package.", 307 "supported": "strong" 308 }, 309 { 310 "claim": "The approach generalizes across LLMs; code-cushman-002 achieves similar results to gpt3.5-turbo (68.2% median statement coverage), while StarCoder achieves lower but still competitive results (54.0% median statement coverage).", 311 "evidence": "Table 4, Section 4.7 (RQ7). Wilcoxon tests show no significant difference between gpt3.5-turbo and code-cushman-002, but significant differences between StarCoder and both OpenAI models (p<0.05). Even StarCoder exceeds Nessie's 51.3%.", 312 "supported": "strong" 313 } 314 ], 315 "methodology_tags": [ 316 "benchmark-eval" 317 ], 318 "key_findings": "TESTPILOT, an adaptive LLM-based test generation tool for JavaScript, achieves a median 70.2% statement coverage and 52.8% branch coverage on 25 npm packages using gpt3.5-turbo, outperforming the state-of-the-art Nessie tool (51.3% and 25.6% respectively) with statistical significance (p=0.002 and p=0.027). Ablation studies show all four prompt refiners contribute meaningfully to test effectiveness, and contamination analysis finds that 92.8% of generated tests have at most 50% similarity to existing tests with no exact copies. The approach generalizes across multiple LLMs (gpt3.5-turbo, code-cushman-002, StarCoder), with even the open-source StarCoder model matching or exceeding Nessie's coverage.", 319 "red_flags": [ 320 { 321 "flag": "GitHub affiliation conflict", 322 "detail": "The lead author (M. Schäfer) is employed by GitHub, the other authors conducted the research as sabbatical visitors or interns at GitHub, and the tool (TESTPILOT) is hosted on GitHub Next. GitHub has a commercial interest in demonstrating LLM-based code tooling effectiveness (related to GitHub Copilot). No competing interests statement is provided despite this non-trivial affiliation conflict."
323 }, 324 { 325 "flag": "No API cost reported", 326 "detail": "The evaluation queries gpt3.5-turbo, code-cushman-002, and StarCoder across 1,684 API functions on 25 packages, repeated 10 times each. No dollar cost for API calls is reported, making it difficult to assess the practical budget required to reproduce this work." 327 }, 328 { 329 "flag": "Variance not reported for main results", 330 "detail": "While the paper runs experiments 10 times and reports medians, no spread measure (std dev, IQR, min/max) is reported alongside the coverage medians in Tables 2-4, making it impossible to assess result stability across runs." 331 }, 332 { 333 "flag": "Training cutoff not stated", 334 "detail": "The paper does not state the training data cutoff for gpt3.5-turbo, code-cushman-002, or StarCoder, making it difficult to fully assess whether the benchmark packages were in training data. The GitLab mitigation partially addresses this for GitHub-hosted packages." 335 } 336 ], 337 "cited_papers": [ 338 { 339 "title": "Nessie: Automatically testing javascript APIs with asynchronous callbacks", 340 "authors": [ 341 "E. Arteca", 342 "S. Harner", 343 "M. Pradel", 344 "F. Tip" 345 ], 346 "year": 2022, 347 "doi": "10.1145/3510003.3510106", 348 "relevance": "State-of-the-art baseline JavaScript test generation tool used as primary comparison in the evaluation." 349 }, 350 { 351 "title": "CodaMOSA: Escaping coverage plateaus in test generation with pre-trained large language models", 352 "authors": [ 353 "C. Lemieux", 354 "J. P. Inala", 355 "S. K. Lahiri", 356 "S. Sen" 357 ], 358 "year": 2023, 359 "relevance": "Related work on using LLMs to assist search-based test generation for Python; directly compared to TESTPILOT in discussion." 360 }, 361 { 362 "title": "Code Generation Tools (Almost) for Free? A Study of Few-Shot, Pre-Trained Language Models on Code", 363 "authors": [ 364 "P. Bareiß", 365 "B. Souza", 366 "M. d'Amorim", 367 "M. 
Pradel" 368 ], 369 "year": 2022, 370 "arxiv_id": "2206.01335", 371 "relevance": "Prior work on few-shot LLM test generation for Java, the closest direct predecessor to TESTPILOT." 372 }, 373 { 374 "title": "Unit test case generation with transformers and focal context", 375 "authors": [ 376 "M. Tufano", 377 "D. Drain", 378 "A. Svyatkovskiy", 379 "S. K. Deng", 380 "N. Sundaresan" 381 ], 382 "year": 2021, 383 "relevance": "Prior work (AthenaTest) on fine-tuned BART model for test generation, compared against in the related work section." 384 }, 385 { 386 "title": "Evaluating Large Language Models Trained on Code", 387 "authors": [ 388 "M. Chen", 389 "J. Tworek", 390 "H. Jun" 391 ], 392 "year": 2021, 393 "arxiv_id": "2107.03374", 394 "relevance": "OpenAI Codex paper — the foundational LLM for code that motivated the approach." 395 }, 396 { 397 "title": "Starcoder: A state-of-the-art LLM for code", 398 "authors": [ 399 "HuggingFace" 400 ], 401 "year": 2023, 402 "relevance": "One of the three LLMs evaluated in RQ7; open-source with documented training process." 403 }, 404 { 405 "title": "CodeT: Code Generation with Generated Tests", 406 "authors": [ 407 "B. Chen", 408 "F. Zhang", 409 "A. Nguyen" 410 ], 411 "year": 2022, 412 "arxiv_id": "2207.10397", 413 "relevance": "Related LLM approach for code generation + test generation from problem descriptions." 414 }, 415 { 416 "title": "On Learning Meaningful Assert Statements for Unit Test Cases", 417 "authors": [ 418 "C. Watson", 419 "M. Tufano", 420 "K. Moran", 421 "G. Bavota", 422 "D. Poshyvanyk" 423 ], 424 "year": 2020, 425 "doi": "10.1145/3377811.3380429", 426 "relevance": "ATLAS: prior work using LLMs to generate assert statements for existing test cases." 427 }, 428 { 429 "title": "TOGA: A neural method for test oracle generation", 430 "authors": [ 431 "E. Dinella", 432 "G. Ryan", 433 "T. Mytkowicz", 434 "S. K. 
Lahiri" 435 ], 436 "year": 2022, 437 "doi": "10.1145/3510003.3510141", 438 "relevance": "Neural approach for test oracle generation compared and discussed in related work." 439 }, 440 { 441 "title": "Neural software analysis", 442 "authors": [ 443 "M. Pradel", 444 "S. Chandra" 445 ], 446 "year": 2022, 447 "doi": "10.1145/3460348", 448 "relevance": "Survey of neural techniques for software engineering tasks; directly cited as framing for the field." 449 }, 450 { 451 "title": "Test generation for higher-order functions in dynamic languages", 452 "authors": [ 453 "M. Selakovic", 454 "M. Pradel", 455 "R. Karim", 456 "F. Tip" 457 ], 458 "year": 2018, 459 "doi": "10.1145/3276531", 460 "relevance": "Prior JavaScript test generation work (LambdaTester) by overlapping authors, context for dynamic API exploration." 461 } 462 ] 463 }