scan.json (26523B)
1 { 2 "paper": { 3 "title": "TestGenEval: A Real World Unit Test Generation and Test Completion Benchmark", 4 "authors": ["Kush Jain", "Gabriel Synnaeve", "Baptiste Rozière"], 5 "year": 2024, 6 "venue": "ICLR 2025", 7 "arxiv_id": "2410.00752", 8 "doi": "10.48550/arXiv.2410.00752" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "TestGenEval is a large-scale benchmark of 68,647 tests from 1,210 code-test file pairs across 11 Python repositories for evaluating LLM test generation and completion. The best model (GPT-4o) achieves only 35.2% coverage and 18.8% mutation score on full test suite generation, showing models struggle with real-world test generation. Test completion is easier than full generation, but models add virtually no new coverage when completing the last test in an existing suite. Models frequently make assertion errors and struggle to reason about code execution.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper provides code and docker images at https://figshare.com/s/51171ae97cd21d233d4f, with detailed instructions on running and extending the benchmark (Section 9, Reproducibility Statement)." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The benchmark data (1,210 code-test file pairs) is released via the figshare link, along with docker images for all 11 repositories and model generations." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Docker images are provided for each repository version with all dependencies installed, including coverage and mutation testing tools. This exceeds a requirements.txt." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": true, 34 "justification": "Section 9 states 'We release code to run and extend TESTGENEVAL along with all docker images' and 'detailed instructions on how to run our benchmark, and even extend it.'" 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "Appendix G reports 95% confidence intervals for all settings (Table 12, Table 17) and performs pairwise statistical comparisons." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": true, 46 "justification": "Appendix G is dedicated to statistical tests, including pairwise comparisons and Elo ratings for model ranking across all settings." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Results are reported with absolute percentages and differences between models (e.g., GPT-4o at 35.2% coverage vs Llama 405B at 35.0%), along with win rates and Elo scores providing magnitude context." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for why 1,210 file pairs was the target size, nor power analysis. The benchmark size is determined by what was extractable from SWEBench rather than by statistical power considerations." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Main results in Tables 2-3 are single-run at temperature=0.2. While pass@5 uses temperature=0.8 with 5 samples, no standard deviation or variance across independent runs is reported for the main results." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper evaluates 10 models across 4 size categories (small, medium, large, flagship), serving as baselines for each other. Table 1 compares TestGenEval against prior benchmarks." 
69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Models evaluated include GPT-4o, Llama 3.1 (8B/70B/405B), Codestral 22B, DeepSeekCoder V2, and Gemma 2 — all contemporary at time of writing (2024)." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper ablates context window size (Section 4.1.3, Figure 8), number of samples (Section 4.1.2, Figure 7), and provides TestGenEvalLite as a representative subset (Appendix E.2)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics are used: pass@1, pass@5, all pass@1, any pass@1, coverage, coverage@pass, mutation score, mutation score@pass, coverage improvement, and Elo ratings." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper includes qualitative analysis with manual examination of model outputs (Section 4.2, Appendix F.6-F.8), including detailed examples of correct and incorrect test generation." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "TestGenEvalLite (160 pairs) is a separate representative subset from the full TestGenEval (1,210 pairs), and results are reported on both. The benchmark itself uses execution-based evaluation with no tuning on the test data." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by model size category (small/medium/large/flagship), by task type (generation vs completion), by completion setting (first/last/extra), and by repository (Table 4)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Extensive error analysis in Appendix F.5 and F.7 categorizes failures (assertion errors, no-assert errors, timeout errors, value errors) with qualitative examples. Section 4.2 discusses specific failure modes." 
104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Key negative findings: models add virtually 0% coverage in last/extra test completion; context window doesn't help much for test generation (Figure 8a); CodeLlama 70B underperforms smaller models. Appendix F.8 shows 'no solve' cases." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims (GPT-4o at 35.2% coverage, models struggle with high-coverage suites, assertion errors on complex paths) are all directly supported by Tables 2-3 and the error analysis in Section 4 and Appendix F." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper is primarily descriptive (benchmark + evaluation). Causal-style claims are modest and supported by controlled ablations: e.g., context window effects (Section 4.1.3) use the same model with varying context sizes." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The benchmark covers only Python across 11 repositories, but the title and framing suggest general 'unit test generation' capabilities. The Limitations section (Section 5, Appendix H) notes SWEBench overfitting risk but doesn't strongly bound claims to Python or these specific project types." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section 5 and Appendix H discuss data contamination, SWEBench overfitting, prompt/temperature sensitivity, and file-level context limitations as alternative explanations for observed results." 
131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper explicitly discusses that coverage and mutation score are proxies for test quality (Appendix H: 'Quantitative metrics such as code coverage and mutation score can approximate the quality of generated tests, however they do not perfectly measure the quality'). They also discuss the oracle problem." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Appendix B provides exact HuggingFace model URLs for all open-source models. GPT-4o is dated to 08/29/2024. Specific model names like 'Meta-Llama-3.1-8B-Instruct' are given." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt text for both test generation (Figure 15) and test completion (Figure 16) is provided in Appendix C." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Temperature is specified: 0.2 for pass@1, 0.8 for pass@5 (Tables 2-3). Context window handling is described (maximum possible, truncate starting tokens)." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. Models are prompted directly to generate test code." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 2.1 and Figure 2 describe the full pipeline: modifying SWEBench images, extracting test file pairs via heuristic matching, filtering by coverage computation and 60-second timeout." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 5 is titled 'Limitations' with substantive discussion. Appendix H provides an expanded 'Limitations' section with additional detail." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Specific threats discussed: overfitting to SWEBench repositories, data contamination (with perplexity measurements), compute cost of mutation score, prompt/temperature sensitivity (Appendix H), file-level context limitation, lack of agentic baselines." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Appendix H states specific boundaries: file-level only (not repository-level), Python only, no agentic baselines, 0-shot only, and acknowledges cross-file dependencies are not captured." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "All benchmark data, docker images, model generations, and evaluation code are released via figshare. A website with all model generations is also provided." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 2.1 describes the full benchmark construction process: starting from SWEBench docker images, modifying broken images, extracting file pairs via heuristic matching, filtering criteria." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. The data source is SWEBench, a standard benchmark derived from public GitHub repositories." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Figure 2 shows the full pipeline with three stages. Table 4 shows the distribution across repositories (1,210 pairs from 11 repos). Filtering criteria are explicit (coverage computes, runs in 60 seconds)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding or grant information is disclosed. 
Authors are affiliated with FAIR/Meta AI and CMU but no funding sources are mentioned." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: Carnegie Mellon University and FAIR, Meta AI. Meta develops Llama and CodeLlama models which are evaluated in the paper." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "Authors are from Meta AI (FAIR), which develops several of the evaluated models (Llama 3.1, CodeLlama). Meta has a financial interest in their models performing well on benchmarks." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the evaluated models. The paper uses GPT-4o dated to 08/29/2024 (evaluation date) but not its training cutoff." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": true, 235 "justification": "Section 5 discusses data contamination: 'we measure perplexity of 10 randomly selected tests in TESTGENEVAL for Llama 3.1 8B and common frequent, and non recent code from GitHub' finding perplexity of common code lower than TestGenEval tests (1.6 vs 2.0)." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": true, 240 "justification": "Section 5 addresses contamination risk directly, arguing the benchmark is unlikely contaminated based on perplexity analysis and universally low model performance. Also notes SWEBench overfitting risk." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 
248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference costs, API costs, or per-example costs are reported despite evaluating 10 models across thousands of examples at multiple temperatures." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total compute budget is stated. The paper mentions mutation score compute cost as a limitation but does not quantify the total compute used for the evaluation." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Results are reported at fixed temperatures (0.2 and 0.8) with no seed sensitivity analysis. Main results appear to be single-run." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "For pass@k metrics: 1 generation for pass@1 at temp=0.2, 5 generations for pass@5 at temp=0.8. Appendix F.3 reports up to 20 samples for coverage@k and 100 for pass@k analysis." 
302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search is reported. The paper uses fixed temperatures (0.2, 0.8) and maximum context windows without discussing whether these were tuned or how they were selected." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "Temperature values of 0.2 and 0.8 are used without justification for why these specific values were chosen over alternatives." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "Appendix G reports pairwise comparisons across 10 models and 4 settings but no multiple comparison correction (Bonferroni, etc.) is mentioned." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "Meta AI authors evaluate Meta's own Llama and CodeLlama models alongside competitors. No acknowledgment of potential bias in implementation or evaluation setup favoring their own models." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "Models of vastly different sizes (7B to 405B) are compared without discussing compute costs. GPT-4o's compute budget is unknown but likely much larger than smaller open-source models." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "Section 2.3 and Table 1 explicitly discuss how TestGenEval's properties (file-level, human-written tests, mutation score) relate to real-world test generation. The paper argues mutation score is more correlated with real fault detection than coverage alone, citing Just et al. (2014)." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is used. All models are evaluated via direct prompting with the same prompt templates." 
337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": true, 343 "justification": "Section 5 addresses temporal leakage through perplexity analysis comparing TestGenEval tests against common GitHub code, finding TestGenEval has higher perplexity (2.0 vs 1.6), suggesting tests are not memorized." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information through context. The prompts provide imports and code under test, but whether this constitutes unrealistic information leakage compared to real-world usage is not discussed." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the 11 SWEBench repositories overlap with model training data at the repository level, or whether file pairs from the same repository are non-independent." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": true, 358 "justification": "A concrete leakage detection method is used: perplexity measurement on 10 randomly selected tests compared against common GitHub code for Llama 3.1 8B (Section 5)." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "GPT-4o achieves the best coverage (35.2%) and mutation score (18.8%) among all evaluated models on full test suite generation.", 365 "evidence": "Table 2 (Section 3.1) shows GPT-4o at 35.2% coverage and 18.8% mutation score, compared to Llama 3.1 405B at 35.0%/16.4%.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Models struggle to generate high-coverage test suites for real-world projects, with significantly lower performance than on self-contained benchmarks like TestEval.", 370 "evidence": "Table 2 shows max 35.2% coverage vs near 100% on TestEval (Section 1). 
Weak positive correlation with TestEval (Section 4.1.1, Figure 6b).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Test completion is significantly easier than full test generation, but models add virtually no new coverage when completing the last test in an existing suite.", 375 "evidence": "Table 3 shows pass@5 up to 74.3% for last test completion, but coverage improvement is near 0% (+Cov column). Table 8 confirms this across all models.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Models primarily struggle with reasoning about execution, leading to frequent assertion errors.", 380 "evidence": "Appendix F.5 (Figure 31) shows assertion errors are the most common error type for Codestral 22B. Qualitative analysis in Section 4.2 and Appendix F.7 provides specific examples.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "TestGenEvalLite (160 pairs) approximates the full benchmark metrics.", 385 "evidence": "Appendix E.2 shows similar trends in TestGenEvalLite vs TestGenEval. Appendix G reports statistical significance for both splits.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Data contamination is unlikely to be a major issue for TestGenEval.", 390 "evidence": "Perplexity analysis on 10 randomly selected tests shows higher perplexity than common GitHub code (2.0 vs 1.6) for Llama 3.1 8B (Section 5).", 391 "supported": "weak" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "Company evaluating own models", 397 "detail": "Meta AI (FAIR) authors evaluate Meta's Llama and CodeLlama models. While GPT-4o outperforms them, the benchmark design choices (Python only, SWEBench-derived, specific prompt format) could inadvertently favor certain model families." 398 }, 399 { 400 "flag": "Weak contamination analysis", 401 "detail": "Data contamination analysis uses only 10 randomly selected tests with perplexity for a single model (Llama 3.1 8B). 
This is a very small sample and a single detection method for a benchmark derived from well-known public repositories." 402 }, 403 { 404 "flag": "No compute costs reported", 405 "detail": "Evaluating 10 models across 1,210 examples with mutation testing is computationally expensive, yet no costs are reported. This limits reproducibility assessment." 406 }, 407 { 408 "flag": "Single-run main results", 409 "detail": "Main results at temperature=0.2 appear to be single-run without variance estimates. The statistical tests in Appendix G partially address this but main tables lack uncertainty." 410 } 411 ], 412 "cited_papers": [ 413 { 414 "title": "Evaluating large language models trained on code", 415 "authors": ["Mark Chen", "Jerry Tworek"], 416 "year": 2021, 417 "arxiv_id": "2107.03374", 418 "relevance": "Introduces HumanEval, the foundational code generation benchmark that TestGenEval compares against." 419 }, 420 { 421 "title": "SWE-bench: Can language models resolve real-world github issues?", 422 "authors": ["Carlos E Jimenez", "John Yang"], 423 "year": 2024, 424 "relevance": "TestGenEval is built on SWEBench's infrastructure and docker images, making this a direct methodological dependency." 425 }, 426 { 427 "title": "TestEval: Benchmarking large language models for test case generation", 428 "authors": ["Wenhan Wang", "Chenyuan Yang"], 429 "year": 2024, 430 "arxiv_id": "2406.04531", 431 "relevance": "Existing test generation benchmark on LeetCode problems that TestGenEval positions against as being too simple/self-contained." 432 }, 433 { 434 "title": "Code agents are state of the art software testers", 435 "authors": ["Niels Mündler", "Mark Niklas Müller"], 436 "year": 2024, 437 "arxiv_id": "2406.12952", 438 "relevance": "SWT-Bench: adjacent benchmark for test method generation targeted at bug-fixing PRs." 
439 }, 440 { 441 "title": "R2e: Turning any github repository into a programming agent environment", 442 "authors": ["Naman Jain", "Manish Shetty"], 443 "year": 2024, 444 "relevance": "Repository-level executable environment for evaluating coding agents, uses equivalence test harnesses." 445 }, 446 { 447 "title": "CruxEval: A benchmark for code reasoning, understanding and execution", 448 "authors": ["Alex Gu", "Baptiste Rozière"], 449 "year": 2024, 450 "arxiv_id": "2401.03065", 451 "relevance": "Executable code benchmark measuring execution reasoning ability, a subset of the test generation task." 452 }, 453 { 454 "title": "The Llama 3 herd of models", 455 "authors": ["Abhimanyu Dubey"], 456 "year": 2024, 457 "arxiv_id": "2407.21783", 458 "relevance": "Source of Llama 3.1 models (8B/70B/405B) evaluated as primary open-source baselines." 459 }, 460 { 461 "title": "DeepSeek-Coder-V2: Breaking the barrier of closed-source models in code intelligence", 462 "authors": ["DeepSeek-AI"], 463 "year": 2024, 464 "arxiv_id": "2406.11931", 465 "relevance": "Code-specialized model evaluated as a medium-sized baseline in the benchmark." 466 }, 467 { 468 "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions", 469 "authors": ["Terry Yue Zhuo"], 470 "year": 2024, 471 "arxiv_id": "2406.15877", 472 "relevance": "Contemporary code generation benchmark with execution-based evaluation." 473 }, 474 { 475 "title": "CAT-LM training language models on aligned code and tests", 476 "authors": ["N. Rao", "K. Jain"], 477 "year": 2023, 478 "relevance": "Software testing model that measures test completion at method level, directly related to TestGenEval's test completion task." 479 }, 480 { 481 "title": "TOGA: A neural method for test oracle generation", 482 "authors": ["Elizabeth Dinella", "Gabriel Ryan"], 483 "year": 2022, 484 "relevance": "Neural test oracle generation benchmark focused on assertion completion." 
485 }, 486 { 487 "title": "RepoBench: Benchmarking repository-level code auto-completion systems", 488 "authors": ["Tianyang Liu", "Canwen Xu"], 489 "year": 2023, 490 "arxiv_id": "2306.03091", 491 "relevance": "Repository-level code completion benchmark relevant to understanding context effects on code generation." 492 } 493 ] 494 }