scan.json (30367B)
1 { 2 "paper": { 3 "title": "ProjectTest: A Project-level LLM Unit Test Generation Benchmark and Impact of Error Fixing Mechanisms", 4 "authors": [ 5 "Yibo Wang", 6 "Congying Xia", 7 "Wenting Zhao", 8 "Jiangshu Du", 9 "Chunyu Miao", 10 "Zhongfen Deng", 11 "Philip S. Yu", 12 "Chen Xing" 13 ], 14 "year": 2025, 15 "venue": "arXiv", 16 "arxiv_id": "2502.06556", 17 "doi": "10.48550/arXiv.2502.06556" 18 }, 19 "scan_version": 2, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval"], 22 "key_findings": "ProjectTest introduces a project-level unit test generation benchmark covering 60 projects across Python, Java, and JavaScript, evaluated on 9 frontier LLMs. Java is the hardest language for all models due to strict syntax, with compilation rates as low as 0% for some open-source models. Manual fixing of compilation and cascade errors produces dramatic improvements (e.g., CodeQwen1.5 goes from 0% to 60% correctness on Java), revealing that basic errors mask LLMs' underlying test generation capabilities. LLM self-fixing generally lags behind manual fixing, with open-source models particularly struggling to self-correct.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper states 'Our code and dataset is available at ProjectTest' in the abstract, referencing a public release. The linked resource appears to be a repository." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The paper states 'Our code and dataset is available at ProjectTest' in the abstract. The 60 projects and their metadata are described as publicly available." 
34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper mentions '8 NVIDIA A100 GPUs' and names testing frameworks (Pytest, Jacoco, JEST) but provides no requirements.txt, Dockerfile, or detailed dependency/version listing for reproducing the evaluation environment." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions are provided in the paper. The methodology is described at a high level (Section 3) but lacks specific commands or scripts to replicate the evaluation pipeline." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "All results in Tables 2, 3, and 4 are reported as point estimates (e.g., '47%', '65%') with no confidence intervals, error bars, or uncertainty measures." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper makes numerous comparative claims (e.g., 'GPT-o1 performs the best in general', 'Java is the most difficult language') based solely on comparing raw numbers without any statistical significance tests." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Tables 3 and 4 report results with both the new value and the signed delta relative to the comparison scenario, e.g., '74% (+27%)' and '43% (-16%)', providing baseline-relative context for interpreting the magnitude of changes from vanilla to manual-fixed and self-fixed scenarios." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "The choice of 20 projects per language (60 total) is not justified. No power analysis or sample size rationale is provided. The paper does not discuss whether 20 projects is sufficient for the comparative claims being made." 
66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "No variance or standard deviation is reported. Temperature is set to 0 for deterministic inference, but there is no discussion of API non-determinism or variance across runs. All results are single-run point estimates." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Nine LLMs are compared against each other, including 5 closed-source and 4 open-source models. Three evaluation scenarios (vanilla, manual fixing, self-fixing) serve as comparisons against each other." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "The evaluation includes contemporary models: GPT-o1, Claude-3.5-Sonnet (2024-10-22), and Gemini-2.0-Flash-Exp. Open-source models include CodeQwen1.5, DeepSeek-Coder, CodeLlama, and CodeGemma, which are somewhat older but still relevant." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Appendix D.1 (Table 11) provides a prompt ablation study removing individual prompt components (PL, CR, ComR, Coverage) and testing their impact on GPT-4-Turbo's performance. Appendix D.2 ablates compilation vs. cascade error fixing." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Four evaluation metrics are used: Compilation Rate (ComR), Correctness Rate (CR), Line Coverage (LC), and Branch Coverage (BC). Additionally, unique contribution analysis is performed in Section 5.4." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "No human evaluation of the generated unit tests' quality is performed. The manual fixing scenario involves humans correcting errors but not evaluating test quality, readability, or meaningfulness." 
98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": false, 102 "justification": "No explicit dev/test split exists. The prompts were 'carefully designed' (Section 3.3) and the ablation study (Table 11) was conducted on the same 20 Python projects used for main evaluation. There is no separation between prompt development and final evaluation data." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Results are broken down by programming language (Python, Java, JavaScript), by model (9 models), and by evaluation scenario (vanilla, manual-fixed, self-fixed). Tables 2-4 provide comprehensive per-language, per-model breakdowns." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 5.5 and Appendix E provide extensive error analyses covering compilation errors, cascade errors, and post-fix errors per language, with specific examples (Figures 4, 5, 10-12)." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "Several negative results are reported: LLM self-fixing often degrades performance compared to manual fixing (Table 4), coverage-related prompts are 'not always beneficial' (Table 11), and open-source models completely fail on Java (0% compilation for CodeQwen1.5, CodeLlama, CodeGemma)." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "Abstract claims about moderate LLM performance, benchmark difficulty, significant compilation errors in frontier LLMs, and self-fixing lagging behind manual fixing are all supported by Tables 2-4 and the error analyses in Section 5.5." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper makes causal claims without adequate justification. 
For example, 'Java is the most difficult language, primarily due to stricter syntax' (Section 5.1) attributes difficulty to a specific cause without controlling for confounds like project complexity or model training data composition." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": true, 134 "justification": "Claims are generally bounded to the tested setting. The paper refers to 'all tested frontier LLMs' rather than all LLMs, and findings are specific to ProjectTest. The limitations section acknowledges the restriction to three languages and moderate-sized projects." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper does not discuss alternative explanations for its findings. For example, performance differences across languages could be explained by training data composition rather than syntax strictness, but this is not considered. No robustness checks are performed." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper's claims match the granularity of its measurements. It measures compilation rate, correctness rate, and coverage rate and frames results in those terms without inflating claims to broader constructs like 'code quality' or 'developer productivity'." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "Claude-3.5-Sonnet-20241022 is fully specified. Open-source models have exact HuggingFace names. However, GPT-4-Turbo, GPT-3.5-Turbo, and GPT-o1 lack snapshot dates or API version identifiers. Gemini-2.0-Flash-Exp is partially specified. Multiple key models lack reproducible version information." 
152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Full prompts are provided for all three languages in Figures 3, 7, 8 (vanilla generation), Figure 6 (self-fixing), and Figure 9 (comment-sign variant). The actual text sent to the models is given, not just descriptions." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 4.2 states: 'The temperature is set to 0 during inference.' Maximum input length is configured per model. Hardware (8 NVIDIA A100 GPUs) is specified." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. The evaluation uses direct zero-shot prompting of LLMs without any agent, tool use, or multi-step pipeline." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 3.1 (Pre-processing) documents the preprocessing steps: syntax checking, making extracted projects self-contained, reorganizing files, adjusting import paths, consolidating multi-line statements, and preserving original code style." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "A dedicated 'Limitations' section is present after the Conclusion, discussing language coverage restrictions and project size constraints." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "The limitations mention specific threats: focusing on only three programming languages (missing C, C#), selecting moderate-sized projects due to LLM input length restrictions, which limits exploration to robustness issues rather than long-context handling." 
184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "The paper explicitly states what is not tested: other programming languages, larger projects, and long-context handling. It bounds the focus to 'issues like the robustness of LLMs in unit test generation (e.g., hallucinations or incorrect assertions) rather than focusing solely on their ability to handle long-context inputs.'" 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": true, 195 "justification": "The paper states code and dataset are available at ProjectTest. The raw projects and generated unit tests should be accessible for verification." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 3.1 describes the data collection procedure in detail: projects collected from GitHub, filtered by size (2-15 files, <1600 LOC), inter-file dependencies, public licenses, and high star/fork counts. Tables 6-8 list every project with stars, forks, license, and links." 201 }, 202 "recruitment_methods_described": { 203 "applies": true, 204 "answer": true, 205 "justification": "The project selection method is described: GitHub repositories filtered by size, dependencies, license type, and community metrics (stars, forks). Some projects were extracted from larger codebases and adjusted to be self-contained. Appendix A provides per-project details." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The pipeline from collection to evaluation is documented: GitHub filtering → preprocessing (syntax checks, self-containment adjustments, multi-line consolidation) → prompt construction → LLM evaluation → metric computation. Section 3 covers each stage." 
211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding source is disclosed in the paper. There is no acknowledgments section listing grants or sponsors, despite authors being affiliated with a university (UIC), Salesforce Research, and Scale AI." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are clearly listed: University of Illinois Chicago, Salesforce Research, and Scale AI. These are prominently displayed in the paper header." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "No funding is disclosed, making independence impossible to assess. Authors at Salesforce Research and Scale AI (companies with interests in LLM capabilities and evaluation) could have interests in the outcomes, but no funding relationship is stated." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial interests statement is present in the paper. Authors at Scale AI (an AI evaluation company) and Salesforce Research have potential interests related to the findings but these are not declared." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": false, 239 "justification": "No training data cutoff dates are stated for any of the 9 evaluated models. This is critical because the benchmark projects are from popular GitHub repositories that are likely in training data." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": false, 244 "justification": "No discussion of whether the benchmark projects (many from repos with 2,900-103,000 stars) appeared in any model's training data. This is a significant omission given the high visibility of these repositories." 
245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": false, 249 "justification": "Benchmark projects come from highly popular GitHub repos (e.g., three.js with 103,000 stars, keras preprocessing with 1,024 stars). These were certainly available before training cutoffs of the tested models. No contamination analysis is provided." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study. The evaluation is entirely automated using LLM-generated unit tests." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants. The study evaluates LLMs on code generation tasks." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants. The manual fixing was performed by the authors but demographics are not relevant as this is not a human subjects study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants or randomized experimental conditions." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants or conditions requiring blinding." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No inference cost, API spend, tokens consumed, or latency is reported for any of the 9 models evaluated. The paper runs hundreds of LLM evaluations without mentioning cost." 
294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "Only hardware is mentioned ('8 NVIDIA A100 GPUs' in Section 4.2). No total GPU hours, API spend, or wall-clock time for the experiments is provided." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "No seed sensitivity analysis is performed. Temperature is set to 0 for deterministic inference, but API non-determinism is not addressed and no multi-run analysis is provided." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": false, 310 "justification": "The number of experimental runs is not explicitly stated. The use of temperature 0 implies single runs, but this is not confirmed. The paper presents results without stating how many times each experiment was conducted." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "No hyperparameter search budget is reported. The prompts were 'carefully designed' but no information is given about how many prompt variants were tried before selecting the final prompts." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": false, 320 "justification": "The 'Full Prompt' configuration is used as the default but selection is not justified on a separate validation set. The prompt ablation (Table 11) is conducted on the same test data, making it unclear whether the prompt was selected based on test set performance." 321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite many comparisons across 9 models, 3 languages, 3 scenarios, and 4 metrics." 
326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors do not acknowledge potential bias in benchmark construction. The selection of projects and design of prompts could favor certain models, but this is not discussed." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "GPT-o1 (a reasoning model with higher compute) is compared directly to smaller models like CodeGemma-7B without any discussion of compute budget differences. Performance is not contextualized against compute cost." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "The paper does not discuss whether compilation rate, correctness rate, and coverage rate on 20 moderate-sized projects adequately capture 'project-level unit test generation capability'. No comparison with alternative evaluation approaches." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No scaffolding is used. All models are evaluated with direct zero-shot prompting in the same pipeline." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "Not discussed. The benchmark projects are from well-known GitHub repositories (some with 100k+ stars) that certainly existed before the training cutoffs of all tested models. LLMs may have seen these projects and their existing unit tests during training." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "Not discussed. The evaluation provides the full project source code as input, but if models memorized these projects during training, they could reproduce or approximate known test suites rather than generating novel tests." 
358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "Not discussed. Multiple projects come from the same repositories: 8 Python card game projects from the same repo (2,937 stars), 4 stock-related Python projects from the same book repo (10,700 stars). This creates non-independence between test instances." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, temporal splits, or decontamination of any kind." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "All frontier LLMs exhibit moderate performance on ProjectTest on Python and Java, highlighting its difficulty.", 374 "evidence": "Table 2 shows correctness rates of 13-64% for Python and 0-53% for Java across 9 models. Compilation rates range from 50-70% for Python and 0-75% for Java.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Java is the most difficult language for LLM unit test generation, primarily due to stricter syntax.", 379 "evidence": "Table 2 shows consistently lower scores for Java across all metrics and models. Three open-source models (CodeQwen1.5, CodeLlama, CodeGemma) achieve 0% on all Java metrics.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "GPT-o1 performs the best in general, especially in JavaScript.", 384 "evidence": "Table 2 shows GPT-o1 achieves 87% CR and 87% LC on JavaScript, far ahead of other models. It leads or is near the top across most language-metric combinations.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Manual fixing of compilation and cascade errors reveals significant improvement potential for LLM-generated unit tests.", 389 "evidence": "Table 3 shows all models reach 100% compilation rate after fixing. 
Improvements are dramatic: CodeQwen1.5 goes from 0% to 60% CR on Java (+60%), Gemini-2.0-Flash improves from 14% to 54% LC on Java (+40%).", 390 "supported": "strong" 391 }, 392 { 393 "claim": "LLM self-fixing capabilities lag behind manual fixing in quality and reliability.", 394 "evidence": "Table 4 shows self-fixing performs worse than manual fixing across almost all models and languages. For example, GPT-o1 Python CR drops from 89% (manual) to 67% (self-fix), and CodeLlama Python drops from 31% to 0%.", 395 "supported": "strong" 396 }, 397 { 398 "claim": "Different LLMs have different language-level expertise.", 399 "evidence": "Table 2: Claude-3.5-Sonnet ranks first in Java (53% CR, 75% ComR) while GPT-o1 ranks first in JavaScript (87% CR, 95% ComR). In Python, Claude-3.5-Sonnet leads on CR (64%) while GPT-o1 leads on coverage (56% LC).", 400 "supported": "strong" 401 }, 402 { 403 "claim": "LLM-generated unit tests have low unique contribution rates, indicating redundant test generation.", 404 "evidence": "Table 5 shows unique contributions ranging from 2.70% (CodeGemma) to 11.40% (Claude-3.5-Sonnet) on Python, indicating high redundancy across all models.", 405 "supported": "moderate" 406 } 407 ], 408 "red_flags": [ 409 { 410 "flag": "Severe contamination risk from popular repositories", 411 "detail": "Benchmark projects come from highly popular GitHub repositories (e.g., three.js with 103,000 stars, keras preprocessing with 1,024 stars, svm with 10,800 stars). All tested LLMs were almost certainly trained on these codebases, likely including their existing unit tests. This contamination risk is never discussed, and could mean models are partially recalling memorized tests rather than generating novel ones." 
412 }, 413 { 414 "flag": "Non-independent test instances from shared repositories", 415 "detail": "Multiple projects are extracted from the same repositories: 8 Python card game projects from the same RLCard repo (2,937 stars), 4 stock-related projects from the same book repo (10,700 stars), and multiple JavaScript projects from three.js (103,000 stars). Performance on these projects is likely correlated, inflating the effective sample size of 20 projects per language." 416 }, 417 { 418 "flag": "No statistical tests for comparative claims", 419 "detail": "The paper makes numerous ranking and comparison claims across 9 models, 3 languages, and 3 scenarios, all based on comparing point estimates from 20 projects without any statistical significance testing. With such small sample sizes, observed differences could easily be due to chance." 420 }, 421 { 422 "flag": "No uncertainty quantification", 423 "detail": "All results are single-run point estimates with no error bars, confidence intervals, or variance measures. Even with temperature 0, API non-determinism exists, and the small sample (20 projects) means results are sensitive to the specific project selection." 424 } 425 ], 426 "cited_papers": [ 427 { 428 "title": "Evaluating large language models trained on code", 429 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 430 "year": 2021, 431 "arxiv_id": "2107.03374", 432 "relevance": "Introduces the HumanEval benchmark for evaluating LLM code generation, a foundational benchmark in the space." 433 }, 434 { 435 "title": "ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation", 436 "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"], 437 "year": 2023, 438 "relevance": "Class-level code generation benchmark that ProjectTest extends to project level." 
439 }, 440 { 441 "title": "TestEval: Benchmarking large language models for test case generation", 442 "authors": ["Wenhan Wang", "Chenyuan Yang", "Zhijie Wang"], 443 "year": 2024, 444 "relevance": "Function-level test generation benchmark directly related to evaluating LLM testing capabilities." 445 }, 446 { 447 "title": "DevBench: A comprehensive benchmark for software development", 448 "authors": ["Bowen Li", "Wenhan Wu", "Ziwei Tang"], 449 "year": 2024, 450 "arxiv_id": "2403.08604", 451 "relevance": "Software development benchmark that includes project-level unit testing but with limited scope, which ProjectTest aims to address." 452 }, 453 { 454 "title": "TestGenEval: A real world unit test generation and test completion benchmark", 455 "authors": ["Kush Jain", "Gabriel Synnaeve", "Baptiste Rozière"], 456 "year": 2024, 457 "arxiv_id": "2410.00752", 458 "relevance": "Real-world unit test generation benchmark for evaluating LLM capabilities." 459 }, 460 { 461 "title": "An empirical evaluation of using large language models for automated unit test generation", 462 "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"], 463 "year": 2023, 464 "relevance": "Empirical evaluation of LLMs for unit test generation using iterative querying, directly related to the survey scope." 465 }, 466 { 467 "title": "Using large language models to generate junit tests: An empirical study", 468 "authors": ["Mohammed Latif Siddiq"], 469 "year": 2024, 470 "relevance": "Empirical study of LLM JUnit test generation with zero-shot strategies, directly comparable to ProjectTest's evaluation approach." 471 }, 472 { 473 "title": "GPT-4 technical report", 474 "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"], 475 "year": 2023, 476 "arxiv_id": "2303.08774", 477 "relevance": "Technical report for GPT-4, one of the model families evaluated in this benchmark." 
478 }, 479 { 480 "title": "Code Llama: Open foundation models for code", 481 "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"], 482 "year": 2023, 483 "arxiv_id": "2308.12950", 484 "relevance": "Open-source code LLM evaluated in the benchmark, relevant to understanding code generation model capabilities." 485 }, 486 { 487 "title": "DeepSeek-Coder: When the large language model meets programming", 488 "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"], 489 "year": 2024, 490 "relevance": "Open-source code LLM that outperforms some closed-source models on ProjectTest, relevant to code intelligence evaluation." 491 }, 492 { 493 "title": "SWT-Bench: Testing and validating real-world bug-fixes with code agents", 494 "authors": ["Niels Mündler", "Mark Niklas Mueller", "Jingxuan He", "Martin Vechev"], 495 "year": 2024, 496 "relevance": "Software testing benchmark using code agents, related to evaluating LLM capabilities in software engineering tasks." 497 }, 498 { 499 "title": "R2E: Turning any github repository into a programming agent environment", 500 "authors": ["Naman Jain", "Manish Shetty", "Tianjun Zhang"], 501 "year": 2024, 502 "relevance": "Framework for repository-level evaluation of programming agents, related to project-level code evaluation." 503 } 504 ] 505 }