scan.json (25830B)
1 { 2 "paper": { 3 "title": "Code-Aware Prompting: A Study of Coverage-Guided Test Generation in Regression Setting using LLM", 4 "authors": [ 5 "Gabriel Ryan", 6 "Siddhartha Jain", 7 "Mingyue Shang", 8 "Shiqi Wang", 9 "Xiaofei Ma", 10 "Murali Krishna Ramanathan", 11 "Baishakhi Ray" 12 ], 13 "year": 2024, 14 "venue": "Proc. ACM Softw. Eng. (FSE)", 15 "arxiv_id": "2402.00097", 16 "doi": "10.1145/3643769" 17 }, 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "Section 1 contribution (2) states: 'SymPrompt is currently a proprietary research prototype and we are working with our legal team to make the code publicly available.' This is a promise of future release, not an actual release." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "The benchmark is drawn from 26 open source projects used in prior work (Section 4, Benchmark Programs), but the specific set of 897 focal methods selected and the benchmark configuration are not released as a downloadable artifact." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "Section 4 mentions 'AWS p4d.24xl instance with 8 Nvidia A100 GPUs and 96 vCPUs' and 'Python 10 and Pytorch 1.13 with the Huggingface transformers framework,' but there is no requirements.txt, Dockerfile, or detailed environment specification listing library versions sufficient to recreate the environment." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are provided. The code is proprietary, and no README or reproduction guide is mentioned." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper reports only point estimates in Tables 1 and 2 (e.g., Pass@1 = 0.41, Line Cov. 
= 0.48) without confidence intervals or error bars. The only standard deviation mentioned is for Pynguin's baseline coverage (12.7%), not for SymPrompt results." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper makes comparative claims (e.g., '5× improvement', '26% relative coverage improvement') but reports no statistical significance tests (no p-values, t-tests, or any statistical tests)." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "Effect sizes are reported as relative improvements with baselines: 'enhances correct test generations by a factor of 5 and bolsters relative coverage by 26%' (abstract), 'from 36% to 74%' (Section 4.4), and detailed point estimates in Tables 1 and 2 provide context for the magnitude of improvements." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "No justification is given for why 897 focal methods were used, or why 100 were sampled for the ablation (RQ3), or why only 3 unseen projects were used for RQ2. No power analysis is provided." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "Section 4.1 states '10 generations with CodeGen2 and averaged the results,' but no standard deviation, variance, or spread measure is reported across these runs. Tables show only averages." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Multiple baselines are included: No-Op tests, a baseline test completion prompt from prior work (Lemieux et al.), Pynguin SBST tool, and for GPT-4 a describe-generate baseline from prior work." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "Baselines include Pynguin (SBST), CodeGen2 with standard prompts from Lemieux et al. (2023), and ChatTester-style describe-generate prompts from Yuan et al. (2023). 
These are contemporary for the paper's submission period (2023-2024)." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "RQ3 (Section 4.3) presents an ablation study evaluating the contributions of path constraint prompts and calling context independently, showing results for 'Constraints Only', 'Context Only', and full SymPrompt in Table 2." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Five metrics are used: Pass@1, FM Call@1, Correct@1, Line Coverage, and Branch Coverage, as described in Section 4 (Evaluation Metrics)." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "No systematic human evaluation of generated tests is conducted. Section 4.4 mentions 'manually examining 10 generated tests' for path-following accuracy, but this is a tiny ad hoc inspection, not a proper human evaluation study. The paper makes claims about test quality that could benefit from human assessment." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": false, 97 "justification": "The ablation study (RQ3) uses '100 randomly sampled focal methods from the benchmark used in RQ1.' There is no explicit held-out test set; the same benchmark is used throughout, and no dev/test separation is described." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": false, 102 "justification": "Results are reported as averages across all 897 focal methods. No per-project, per-complexity-level, or per-category breakdown is provided in the main results. The 'unseen projects' evaluation in RQ2 is a coarse split, not a detailed breakdown." 
103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 4.4 discusses failure cases: '4 cases are either due to deeply nested branching or exception handling — the model either did not generate correct preconditions for deeply nested branches or error-handling paths.' Section 4.1 also discusses how errors in model generations prevent test suites from executing." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 4.4 reports that 'using the path constraint prompts generated by GPT-4 in isolation did not lead to significant performance improvements over the baseline describe and generate prompts.' The RQ2 results also show lower performance on unseen projects." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims '5× correct test generations' and '26% relative coverage' for CodeGen2, and '2× coverage' for GPT-4. These are supported by Table 1 (Correct@1: 0.03→0.15 ≈ 5×, Line Cov: 0.38→0.48 ≈ 26% relative) and Table 2 (Line Cov: 0.36→0.74 ≈ 105% relative ≈ 2×)." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper makes causal claims via ablation ('path constraint prompts are crucial to the performance improvements'). The ablation design in RQ3 is a controlled single-variable manipulation: removing one component at a time from SymPrompt and measuring the effect. This is adequate for these claims." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title says 'using LLM' (singular, generic) and the paper tests only on Python with two models. Section 5 acknowledges 'our evaluations focus on open source Python projects and utilize specific language models (CodeGen2 and GPT-4). 
This restricts the generalizability,' but then immediately undermines it: 'none of the methods are specific to Python. So we expect the findings will hold consistently in other settings.'" 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper does not discuss alternative explanations for the observed improvements. For instance, it does not consider whether the gains might be due to simply providing more tokens of context rather than the specific path-constraint structure, or whether the iterative prompting procedure alone (without path constraints) could account for improvements." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "CodeGen2 is identified as '16B parameter' (Section 4.1), and GPT-4 is referenced via the OpenAI 2023 technical report. However, no specific version or snapshot date is given for GPT-4 (e.g., 'gpt-4-0613'). CodeGen2 is specified by name and size but no exact checkpoint version." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Figure 3 shows the full SymPrompt generation structure. Figures 7 and 8 show the baseline prompts and the path constraint prompt construction in detail. Figure 8a-c shows the No-Op, baseline, and GPT describe-generate prompt formats. The actual prompt templates with their structural components are provided." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": false, 151 "justification": "Section 4.1 states '10 generations with CodeGen2' but does not report temperature, top-p, max tokens, or other sampling parameters for either CodeGen2 or GPT-4." 
152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The multi-stage scaffolding is described in detail: Section 3 covers the three-step pipeline (path constraint collection, context construction, iterative test generation). Figure 4 provides a workflow diagram. The iterative prompting procedure where previous generations are appended to subsequent prompts is explained." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 4 (Benchmark Programs) describes the selection process: 897 focal methods from 26 open source projects from prior benchmarks, filtered to methods where Pynguin 'was unable to achieve full coverage during 10 runs.' The focal method selection criterion is clear." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 5 'Threats to Validity & Discussion' provides a dedicated section discussing multiple limitations: model and benchmark validity, memorization validity, metric validity, and regression setting limitations." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 5 discusses specific threats: 'Our evaluations focus on open source Python projects and utilize specific language models,' 'the possibility remains that CodeGen2 could have seen some of the focal methods,' and 'generated tests may not uncover any implementation bugs' due to the regression setting assumption." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "While Section 5 mentions some limitations, it does not explicitly state what the results do NOT show. Instead, it follows each limitation with a dismissal: e.g., 'we expect the findings will hold consistently in other settings.' 
There are no explicit boundary statements like 'our results do not demonstrate X.'" 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No raw data (individual test generations, per-method results) is made available. Only aggregated averages are reported in Tables 1 and 2." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 4 describes the data collection: benchmark programs from 26 open source projects used in prior work, selection criterion (methods where Pynguin could not achieve full coverage), and 10 generations per method per strategy." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants were involved. The study evaluates automated test generation on code benchmarks." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline is documented: focal methods are selected from prior benchmarks based on Pynguin coverage criterion, AST traversal collects path constraints, prompts are constructed, models generate tests iteratively, tests are parsed and corrected for syntax errors, then executed to measure coverage. Section 3 and 4 provide these steps." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding acknowledgment section is present in the paper. The work was done at AWS AI Labs (corporate research), but no explicit funding disclosure is made." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly listed: Gabriel Ryan (Columbia University, work done as AWS AI Labs intern) and all other authors at AWS AI Labs. The corporate affiliation is prominent." 
213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "The work was conducted at AWS AI Labs. AWS/Amazon has a commercial interest in AI-powered developer tools and code generation. The funder (AWS) is not independent of the outcome — improved test generation supports AWS's product interests." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement or financial interest declarations are present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "No explicit training data cutoff date is stated for either CodeGen2 or GPT-4. For CodeGen2, the paper mentions it was 'trained on a subset of the Stack' but gives no cutoff date. For GPT-4, no training cutoff is mentioned." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": true, 234 "justification": "RQ2 (Section 4.2) directly addresses train/test overlap: the authors use the AmIInTheStack tool to identify projects not in CodeGen2's training data and evaluate separately on those unseen projects." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": true, 239 "justification": "Section 4.2 addresses contamination by evaluating on projects confirmed not to be in CodeGen2's training data via the AmIInTheStack tool. Section 5 (Memorization Validity) further discusses this: 'we use the AmIInTheStack tool to prevent training data memorization from biasing our RQ2 results.'" 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants. The study evaluates automated test generation tools on code benchmarks." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants involved in the study." 
252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants involved in the study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants involved in the study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants involved in the study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants involved in the study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants involved in the study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "No inference cost, API cost, tokens consumed, or latency is reported for either CodeGen2 or GPT-4. The approach involves multiple LLM calls per focal method (one per execution path) but the total cost is not quantified." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "The hardware is mentioned (AWS p4d.24xl with 8 A100 GPUs) but no total compute budget, GPU hours, wall-clock time, or API spend is reported." 289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "SymPrompt enhances correct test generations by a factor of 5 for CodeGen2.", 295 "evidence": "Table 1 shows Correct@1 improves from 0.03 (baseline) to 0.15 (SymPrompt) on 897 focal methods, a 5× improvement.", 296 "supported": "strong" 297 }, 298 { 299 "claim": "SymPrompt improves relative line coverage by 26% for CodeGen2 over baseline prompts.", 300 "evidence": "Table 1 shows Line Coverage of 0.38 (baseline) vs. 
0.48 (SymPrompt), a 26% relative improvement.", 301 "supported": "moderate" 302 }, 303 { 304 "claim": "When applied to GPT-4, SymPrompt improves coverage by over 2× compared to baseline prompting strategies.", 305 "evidence": "Table 2 RHS shows Line Coverage increasing from 0.36 (baseline) to 0.74 (SymPrompt), a 105% relative improvement. However, this is based on a subsample of 100 methods, not the full 897.", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "SymPrompt performance holds on projects unseen in training data.", 310 "evidence": "Table 1 RHS shows SymPrompt still achieves 4× improvement in Correct@1 on unseen projects (0.03→0.12). However, only 3 unseen projects were identified, a very small sample.", 311 "supported": "weak" 312 }, 313 { 314 "claim": "Path constraint prompts contribute more to performance gains than calling context.", 315 "evidence": "Table 2 LHS ablation: Constraints Only (0.17 Correct@1, 0.49 Line Cov.) vs. Context Only (0.06 Correct@1, 0.42 Line Cov.) vs. SymPrompt (0.26 Correct@1, 0.53 Line Cov.).", 316 "supported": "moderate" 317 }, 318 { 319 "claim": "When test suites with errors are filtered out, SymPrompt compares favorably with Pynguin.", 320 "evidence": "Table 1: SymPrompt Filtered achieves 77% line coverage vs. Pynguin's 72%. However, this filters to only passing test suites, creating a strong selection bias — the comparison is not apples-to-apples.", 321 "supported": "weak" 322 } 323 ], 324 "methodology_tags": [ 325 "benchmark-eval" 326 ], 327 "key_findings": "SymPrompt, a code-aware prompting strategy that decomposes test generation into path-specific prompts using static analysis of execution paths, substantially improves LLM-generated test quality. On CodeGen2 (16B), SymPrompt achieves 5× more correct test generations and 26% higher line coverage in relative terms (0.38→0.48) than baseline prompts on 897 challenging Python methods. 
When applied to GPT-4, SymPrompt doubles test coverage by having the model generate its own path constraint descriptions. An ablation study shows path constraint prompts contribute more to improvement than calling context alone.", 328 "red_flags": [ 329 { 330 "flag": "No error bars or variance measures", 331 "detail": "Despite running 10 generations per method per strategy, no standard deviation, confidence interval, or other uncertainty measure is reported for the LLM-generated results; the only spread figure mentioned is the standard deviation of Pynguin's baseline coverage (12.7%). All LLM results are bare point estimates averaged over the evaluated focal methods (897 for RQ1; a 100-method subsample for the ablation and GPT-4 experiments)." 332 }, 333 { 334 "flag": "No statistical significance tests", 335 "detail": "Claims of improvement (e.g., '5× more correct generations') are made purely by comparing two averages without any significance test, despite having the data to run such tests." 336 }, 337 { 338 "flag": "Selective comparison with Pynguin", 339 "detail": "The 'SymPrompt Filtered' comparison with Pynguin (Table 1) filters out failing test suites, creating a strong selection bias. Pynguin's numbers include all runs. This comparison is not methodologically sound and overstates SymPrompt's competitive position." 340 }, 341 { 342 "flag": "Very small unseen-projects sample", 343 "detail": "The memorization evaluation (RQ2) uses only 3 projects confirmed not in the training data. This is too small a sample to make meaningful claims about generalization to unseen code." 344 }, 345 { 346 "flag": "Corporate affiliation without conflict disclosure", 347 "detail": "All authors except the intern are employed by AWS AI Labs. AWS has commercial interest in AI code generation tools. No conflict of interest statement is provided." 348 }, 349 { 350 "flag": "Code not released", 351 "detail": "SymPrompt is described as a 'proprietary research prototype' with a vague promise of future release. This prevents independent reproduction and verification of results." 
352 }, 353 { 354 "flag": "No hyperparameter reporting", 355 "detail": "Temperature, top-p, max tokens, and other sampling parameters are not reported for either CodeGen2 or GPT-4, despite these significantly affecting generation quality." 356 } 357 ], 358 "cited_papers": [ 359 { 360 "title": "CODAMOSA: Escaping coverage plateaus in test generation with pre-trained large language models", 361 "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K Lahiri", "Siddhartha Sen"], 362 "year": 2023, 363 "relevance": "Hybrid SBST-LLM approach for test generation; key baseline and inspiration for SymPrompt." 364 }, 365 { 366 "title": "Evaluating large language models trained on code", 367 "authors": ["Mark Chen", "Jerry Tworek"], 368 "year": 2021, 369 "arxiv_id": "2107.03374", 370 "relevance": "Introduces HumanEval benchmark and Pass@k metrics used widely in code generation evaluation." 371 }, 372 { 373 "title": "No More Manual Tests? Evaluating and Improving ChatGPT for Unit Test Generation", 374 "authors": ["Zhiqiang Yuan", "Yiling Lou", "Mingwei Liu"], 375 "year": 2023, 376 "arxiv_id": "2305.04207", 377 "relevance": "Evaluates ChatGPT for test generation with iterative repair; used as GPT-4 baseline approach." 378 }, 379 { 380 "title": "Effective Test Generation Using Pre-trained Large Language Models and Mutation Testing", 381 "authors": ["Arghavan Moradi Dakhel", "Amin Nikanjam"], 382 "year": 2023, 383 "arxiv_id": "2308.16557", 384 "relevance": "MuTAP: uses mutation testing to guide LLM test generation for bug detection." 385 }, 386 { 387 "title": "An empirical evaluation of using large language models for automated unit test generation", 388 "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"], 389 "year": 2023, 390 "relevance": "Large-scale empirical study of LLM-based unit test generation quality and limitations." 
391 }, 392 { 393 "title": "Adaptive test generation using a large language model", 394 "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"], 395 "year": 2023, 396 "arxiv_id": "2302.06527", 397 "relevance": "TestPilot: iterative test generation with ChatGPT adding context on failure." 398 }, 399 { 400 "title": "ChatUniTest: a ChatGPT-based automated unit test generation tool", 401 "authors": ["Zhuokui Xie", "Yinghao Chen"], 402 "year": 2023, 403 "arxiv_id": "2305.04764", 404 "relevance": "ChatGPT-based test generation with adaptive focal context and self-debugging." 405 }, 406 { 407 "title": "Chain-of-thought prompting elicits reasoning in large language models", 408 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 409 "year": 2022, 410 "relevance": "Foundational work on multi-step reasoning prompting that motivates SymPrompt's approach." 411 }, 412 { 413 "title": "Tree of thoughts: Deliberate problem solving with large language models", 414 "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"], 415 "year": 2023, 416 "arxiv_id": "2305.10601", 417 "relevance": "Advanced prompting strategy for multi-step reasoning with LLMs." 418 }, 419 { 420 "title": "Codegen2: Lessons for training llms on programming and natural languages", 421 "authors": ["Erik Nijkamp", "Hiroaki Hayashi"], 422 "year": 2023, 423 "arxiv_id": "2305.02309", 424 "relevance": "The primary open-source code model evaluated in this paper." 425 }, 426 { 427 "title": "Unit test case generation with transformers and focal context", 428 "authors": ["Michele Tufano", "Dawn Drain", "Alexey Svyatkovskiy"], 429 "year": 2020, 430 "arxiv_id": "2009.05617", 431 "relevance": "AthenaTest: foundational work on focal context for transformer-based test generation." 
432 }, 433 { 434 "title": "A3Test: Assertion-Augmented Automated Test Case Generation", 435 "authors": ["Saranya Alagarsamy", "Chakkrit Tantithamthavorn", "Aldeida Aleti"], 436 "year": 2023, 437 "arxiv_id": "2302.10352", 438 "relevance": "Post-processing approach to correct test naming errors in LLM-generated tests." 439 } 440 ] 441 }