scan.json (30005B)
1 { 2 "paper": { 3 "title": "LLM-Powered Test Case Generation for Detecting Bugs in Plausible Programs", 4 "authors": [ 5 "Kaibo Liu", 6 "Zhenpeng Chen", 7 "Yiyang Liu", 8 "Jie M. Zhang", 9 "Mark Harman", 10 "Yudong Han", 11 "Yun Ma", 12 "Yihong Dong", 13 "Ge Li", 14 "Gang Huang" 15 ], 16 "year": 2024, 17 "venue": "arXiv preprint", 18 "arxiv_id": "2404.10304", 19 "doi": "10.48550/arXiv.2404.10304" 20 }, 21 "scan_version": 3, 22 "active_modules": ["experimental_rigor", "data_leakage"], 23 "methodology_tags": ["benchmark-eval"], 24 "key_findings": "TrickCatcher, a three-stage LLM-based differential testing approach (PUT-guided variant generation, generator-based input generation, diversity-driven oracle construction), achieves F1 scores of 41.31%, 42.35%, and 51.34% on TrickyBugs (C++), TrickyBugs (Python), and EvalPlus respectively, outperforming the best baseline (Differential Prompting Plus) by 1.66× F1. The approach generates up to 16× fewer false positives than baselines on correct programs. An ablation study confirms each component contributes, and a counterintuitive finding shows that even buggy program variants (23.2% of useful variants) can produce correct test oracles.", 25 "checklist": { 26 "artifacts": { 27 "code_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The abstract states: 'Code and data used are available at https://github.com/RinCloud/TrickCatcher.' A concrete repository URL is provided." 31 }, 32 "data_released": { 33 "applies": true, 34 "answer": true, 35 "justification": "The paper uses two public datasets: TrickyBugs (MIT license) and EvalPlus (Apache 2.0), both publicly available. The GitHub repository also claims to include data." 36 }, 37 "environment_specified": { 38 "applies": true, 39 "answer": false, 40 "justification": "No requirements.txt, Dockerfile, or environment setup details are provided in the paper. Only the model name (gpt-3.5-turbo-0125) and the CYaRon library are mentioned." 41 }, 42 "reproduction_instructions": { 43 "applies": true, 44 "answer": false, 45 "justification": "No step-by-step reproduction instructions are provided in the paper. The combinatorial repetition method is described in Appendix B but does not constitute runnable reproduction instructions." 46 } 47 }, 48 "statistical_methodology": { 49 "confidence_intervals_or_error_bars": { 50 "applies": true, 51 "answer": false, 52 "justification": "Table 1 and all other result tables report point estimates only (e.g., 41.31% F1) with no confidence intervals, error bars, or ± notation despite claiming averaged results." 53 }, 54 "significance_tests": { 55 "applies": true, 56 "answer": false, 57 "justification": "The paper claims TrickCatcher 'significantly outperforms' baselines (e.g., '1.66× F1') based solely on comparing numbers without any statistical significance tests (no p-values, t-tests, or bootstrap tests)." 58 }, 59 "effect_sizes_reported": { 60 "applies": true, 61 "answer": true, 62 "justification": "The paper reports relative improvements with full baseline context: '1.80×, 2.65×, and 1.66× the recall, precision, and F1 score' of DPP. Table 1 provides absolute values for both methods across all configurations." 63 }, 64 "sample_size_justified": { 65 "applies": true, 66 "answer": false, 67 "justification": "No justification is given for why 366 human-written and 151 AI-generated plausible programs are sufficient. No power analysis is discussed." 68 }, 69 "variance_reported": { 70 "applies": true, 71 "answer": false, 72 "justification": "Despite describing a combinatorial averaging methodology in Appendix B (e.g., C(10,4)=210 rounds), the paper reports only averaged point estimates in all tables without any standard deviation, IQR, or spread measure." 73 } 74 }, 75 "evaluation_design": { 76 "baselines_included": { 77 "applies": true, 78 "answer": true, 79 "justification": "Three baselines are compared: DirectChat (CHAT), Differential Prompting Plus (DPP), and Automated Program Repair (APR), described in Section 5.4." 80 }, 81 "baselines_contemporary": { 82 "applies": true, 83 "answer": true, 84 "justification": "Differential Prompting (Li et al., 2023) is cited as the state-of-the-art in LLM-based test case generation for bug detection. The baselines are recent and represent the current landscape." 85 }, 86 "ablation_study": { 87 "applies": true, 88 "answer": true, 89 "justification": "Table 2 (Section 6.3, RQ3) presents an ablation study with 6 patterns showing contributions of each component: PUT-guided program generation, generator-based input generation, and diversity-driven differential testing." 90 }, 91 "multiple_metrics": { 92 "applies": true, 93 "answer": true, 94 "justification": "Three metrics are used: recall, precision, and F1 score, all reported in Table 1 and throughout the evaluation." 95 }, 96 "human_evaluation": { 97 "applies": true, 98 "answer": false, 99 "justification": "Evaluation is entirely automated via comparison with canonical program outputs and official checkers. Manual input validity verification for TrickyBugs is a setup step, not evaluation of system outputs." 100 }, 101 "held_out_test_set": { 102 "applies": true, 103 "answer": true, 104 "justification": "TrickyBugs provides additional test cases beyond the existing suite, and EvalPlus provides extra test cases beyond base tests. These pre-defined ground-truth test sets are not used for any tuning decisions." 105 }, 106 "per_category_breakdown": { 107 "applies": true, 108 "answer": true, 109 "justification": "Results are broken down by dataset (TrickyBugs C++, TrickyBugs Python, EvalPlus), by number of variants k (2-10), and by task difficulty (RQ5, Figures 7-8)." 110 }, 111 "failure_cases_discussed": { 112 "applies": true, 113 "answer": false, 114 "justification": "No qualitative failure analysis or specific examples of where TrickCatcher fails are provided. The paper discusses aggregate false positive counts (RQ2) but does not examine individual failure cases." 115 }, 116 "negative_results_reported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper reports that the naive approach produces 40.10% invalid test inputs (Section 1). The ablation study (Table 2) shows that removing components degrades performance. Limitations of directly generating inputs and programs from specifications are discussed." 120 } 121 }, 122 "claims_and_evidence": { 123 "abstract_claims_supported": { 124 "applies": true, 125 "answer": true, 126 "justification": "The abstract claims 1.80×, 2.65×, and 1.66× improvements in recall, precision, and F1. Table 1 supports these with specific numbers (e.g., TrickCatcher F1 41.31% vs DPP 24.95% on TrickyBugs C++)." 127 }, 128 "causal_claims_justified": { 129 "applies": true, 130 "answer": true, 131 "justification": "Causal claims about component contributions are supported by the ablation study (Table 2), which systematically varies components (PG, IG, DT) in a controlled manner, constituting adequate controlled single-variable manipulation." 132 }, 133 "generalization_bounded": { 134 "applies": true, 135 "answer": false, 136 "justification": "The title frames the approach broadly ('Detecting Bugs in Plausible Programs') but it is tested only on competition-level programming problems (AtCoder-style tasks in TrickyBugs and HumanEval-level tasks in EvalPlus). No explicit acknowledgment that results may not generalize to industrial software or non-specification-driven programs." 137 }, 138 "alternative_explanations_discussed": { 139 "applies": true, 140 "answer": false, 141 "justification": "No discussion of alternative explanations for why TrickCatcher outperforms baselines. Confounds such as the quality of prompts, model memorization of competition problems, or differences in compute cost between methods are not considered." 142 }, 143 "proxy_outcome_distinction": { 144 "applies": true, 145 "answer": true, 146 "justification": "The paper measures precision, recall, and F1 for bug detection in plausible programs, which directly matches the claimed contribution. No proxy gap exists — the measurement and the framing align." 147 } 148 }, 149 "setup_transparency": { 150 "model_versions_specified": { 151 "applies": true, 152 "answer": true, 153 "justification": "The paper specifies 'gpt-3.5-turbo-0125' (Section 5.5), which includes a snapshot date identifier. DeepSeek-v3 is also named in the generalization experiment (Section 7.2)." 154 }, 155 "prompts_provided": { 156 "applies": true, 157 "answer": true, 158 "justification": "Figures 3 and 4 provide the actual prompt templates for program variant generation and test input generator generation, with placeholders for specification and code clearly indicated." 159 }, 160 "hyperparameters_reported": { 161 "applies": true, 162 "answer": false, 163 "justification": "No LLM API hyperparameters are reported — no temperature, top-p, max tokens, or sampling settings are mentioned for gpt-3.5-turbo-0125 or deepseek-v3." 164 }, 165 "scaffolding_described": { 166 "applies": false, 167 "answer": false, 168 "justification": "TrickCatcher is a sequential three-step pipeline (generate variants → generate inputs → differential testing), not an agentic scaffolding system with tools, retry logic, or memory." 169 }, 170 "data_preprocessing_documented": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 5.2 describes how programs were selected: TrickyBugs provides 251 C++ and 115 Python plausible programs; EvalPlus programs were filtered from pre-generated LLM code samples to retain those passing base tests but failing extra tests, yielding 151 programs." 174 } 175 }, 176 "limitations_and_scope": { 177 "limitations_section_present": { 178 "applies": true, 179 "answer": true, 180 "justification": "A dedicated 'Limitations' section appears after Section 8 (Conclusion), discussing three specific limitations: budget constraints, LLM uncertainty, and data leakage risk." 181 }, 182 "threats_to_validity_specific": { 183 "applies": true, 184 "answer": true, 185 "justification": "The limitations are specific to this study: gpt-3.5-turbo was chosen due to budget constraints (not ideal), multiple repetitions were used to mitigate LLM randomness, and TrickyBugs was released after the model's training cutoff to address data leakage." 186 }, 187 "scope_boundaries_stated": { 188 "applies": true, 189 "answer": false, 190 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to competition-level programming problems, does not acknowledge inapplicability to programs without clear formal specifications, and does not discuss what settings were excluded." 191 } 192 }, 193 "data_integrity": { 194 "raw_data_available": { 195 "applies": true, 196 "answer": true, 197 "justification": "The GitHub repository (https://github.com/RinCloud/TrickCatcher) claims to provide code and data. Both underlying datasets (TrickyBugs and EvalPlus) are publicly available." 198 }, 199 "data_collection_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "Section 5.2 describes both datasets: TrickyBugs contains coding tasks from AtCoder with plausible programs from real participants; EvalPlus is a code generation benchmark with 164 tasks filtered to 151 with AI-generated plausible programs." 203 }, 204 "recruitment_methods_described": { 205 "applies": false, 206 "answer": false, 207 "justification": "No human participants. Data comes from standard public benchmarks (TrickyBugs and EvalPlus)." 208 }, 209 "data_pipeline_documented": { 210 "applies": true, 211 "answer": true, 212 "justification": "The pipeline is documented: datasets are described, filtering criteria for EvalPlus are stated (pass base tests, fail extra tests), program variant filtering via existing test suites is described, and the combinatorial repetition methodology is detailed in Appendix B." 213 } 214 }, 215 "conflicts_of_interest": { 216 "funding_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "The Acknowledgements section lists: National Key R&D Program (2023YFB4503801), NSFC (62192733, 62192730), Hubei Province (2023BAA024), and ITEA Genius/GreenCode projects via InnovateUK." 220 }, 221 "affiliations_disclosed": { 222 "applies": true, 223 "answer": true, 224 "justification": "All author affiliations are clearly listed: Peking University, Nanyang Technological University, King's College London, University College London." 225 }, 226 "funder_independent_of_outcome": { 227 "applies": true, 228 "answer": true, 229 "justification": "Funding comes from national science foundations and government R&D programs (NSFC, National Key R&D, InnovateUK) which have no financial stake in TrickCatcher's performance." 230 }, 231 "financial_interests_declared": { 232 "applies": true, 233 "answer": false, 234 "justification": "No competing interests or financial interests statement is present in the paper." 235 } 236 }, 237 "contamination": { 238 "training_cutoff_stated": { 239 "applies": true, 240 "answer": false, 241 "justification": "The paper does not state the training data cutoff date for gpt-3.5-turbo-0125. The Limitations section implies temporal awareness ('TrickyBugs was released after gpt-3.5-turbo-0125') but does not state the actual cutoff date." 242 }, 243 "train_test_overlap_discussed": { 244 "applies": true, 245 "answer": true, 246 "justification": "The Limitations section addresses this: 'the TrickyBugs dataset we used was released after gpt-3.5-turbo-0125, and EvalPlus explicitly prohibits its use for training LLMs. Moreover, the poor performance of the three LLM-based baselines further suggests that data leakage is not a main concern.'" 247 }, 248 "benchmark_contamination_addressed": { 249 "applies": true, 250 "answer": true, 251 "justification": "The Limitations section directly addresses contamination risk with two arguments: temporal ordering (TrickyBugs released after model) and licensing restriction (EvalPlus prohibits training use), plus empirical evidence (poor baseline performance suggests no memorization)." 252 } 253 }, 254 "human_studies": { 255 "pre_registered": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study. Evaluation is automated using benchmark datasets." 259 }, 260 "irb_or_ethics_approval": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "demographics_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "inclusion_exclusion_criteria": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "randomization_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "blinding_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 }, 285 "attrition_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants in this study." 289 } 290 }, 291 "cost_and_practicality": { 292 "inference_cost_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No API costs, token counts, or wall-clock time are reported despite the method requiring multiple LLM calls per program (generating 10 variants + input generators). The paper mentions 'budget constraints' as a reason for choosing gpt-3.5-turbo but never quantifies the actual cost." 296 }, 297 "compute_budget_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "No total computational budget, GPU hours, or API spend is reported. The paper acknowledges choosing gpt-3.5-turbo for cost reasons but provides no figures." 301 } 302 }, 303 "experimental_rigor": { 304 "seed_sensitivity_reported": { 305 "applies": true, 306 "answer": false, 307 "justification": "Despite describing a combinatorial repetition approach in Appendix B, the paper does not report variance or sensitivity across runs. Only averaged point estimates appear in all result tables." 308 }, 309 "number_of_runs_stated": { 310 "applies": true, 311 "answer": true, 312 "justification": "Appendix B clearly states the repetition methodology: 100 test inputs sampled, 10 program variants sampled, with combinatorial selection (e.g., C(10,4)=210 rounds for k=4)." 313 }, 314 "hyperparameter_search_budget": { 315 "applies": true, 316 "answer": false, 317 "justification": "No hyperparameter search is described. The number of variants k is varied (2-10) as a parameter study but no search budget for other design decisions is reported." 318 }, 319 "best_config_selection_justified": { 320 "applies": true, 321 "answer": true, 322 "justification": "Table 1 reports results for all k values (2, 4, 6, 8, 10) rather than cherry-picking the best. The paper uses three comparison methods (Average, Best vs. Best, Worst vs. Worst) for comprehensive evaluation." 323 }, 324 "multiple_comparison_correction": { 325 "applies": false, 326 "answer": false, 327 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 328 }, 329 "self_comparison_bias_addressed": { 330 "applies": true, 331 "answer": false, 332 "justification": "The authors implemented Differential Prompting Plus (DPP) themselves as a modified version of the original DP method. They do not acknowledge the potential bias of re-implementing baselines." 333 }, 334 "compute_budget_vs_performance": { 335 "applies": true, 336 "answer": false, 337 "justification": "No comparison of compute cost between methods. TrickCatcher and DPP both generate variants and inputs but may differ in total API calls; this is not analyzed." 338 }, 339 "benchmark_construct_validity": { 340 "applies": true, 341 "answer": false, 342 "justification": "No discussion of whether TrickyBugs and EvalPlus adequately capture the general problem of detecting bugs in plausible programs. Both are competition-level programming problems, which may not represent real-world software." 343 }, 344 "scaffold_confound_addressed": { 345 "applies": false, 346 "answer": false, 347 "justification": "No scaffolding is involved. All methods use the same LLM (gpt-3.5-turbo-0125) with different prompting strategies in a non-agentic pipeline." 348 } 349 }, 350 "data_leakage": { 351 "temporal_leakage_addressed": { 352 "applies": true, 353 "answer": true, 354 "justification": "The Limitations section notes that 'the TrickyBugs dataset we used was released after gpt-3.5-turbo-0125,' addressing temporal ordering between training data and benchmark." 355 }, 356 "feature_leakage_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether the evaluation setup leaks information through context. For instance, providing the PUT alongside the specification could leak information about the expected solution structure." 360 }, 361 "non_independence_addressed": { 362 "applies": true, 363 "answer": false, 364 "justification": "No discussion of whether TrickyBugs tasks or EvalPlus tasks share structural similarities with LLM training data, or whether tasks within the datasets are independent of each other." 365 }, 366 "leakage_detection_method": { 367 "applies": true, 368 "answer": false, 369 "justification": "No concrete leakage detection method is applied. The paper argues against contamination conceptually (temporal ordering, license restrictions, poor baseline performance) but uses no canary strings, membership inference, or n-gram overlap analysis." 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "TrickCatcher achieves recall, precision, and F1 scores that are 1.80×, 2.65×, and 1.66× those of the state-of-the-art baseline (DPP).", 376 "evidence": "Table 1 shows TrickCatcher at k=10 achieves F1 of 41.31%, 42.35%, and 51.34% on TrickyBugs (C++), TrickyBugs (Python), and EvalPlus, compared to DPP's best F1 of 24.95%, 36.20%, and 35.76% (Section 6.1).", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "TrickCatcher generates up to 16× fewer false positives for correct programs compared to state-of-the-art methods.", 381 "evidence": "Figure 5 (Section 6.2) shows false positive counts on EvalPlus correct programs: TrickCatcher generates 2.57-5.00 FPs vs DPP's 26.33-29.47 FPs.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "Each component of TrickCatcher (PUT-guided program generation, generator-based input generation, diversity-driven differential testing) contributes to its final performance.", 386 "evidence": "Table 2 (Section 6.3) ablation study on TrickyBugs (C++) shows systematic degradation when components are replaced with basic alternatives (6 patterns compared).", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "TrickCatcher's performance remains consistently stable and high with different numbers of program variants.", 391 "evidence": "Figure 6 (Section 6.4) shows TrickCatcher's precision and F1 remain stable as k changes from 2-10 on TrickyBugs (C++), unlike DPP which fluctuates.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "TrickCatcher demonstrates more significant improvement over DPP on harder coding tasks.", 396 "evidence": "Figures 7-8 (Section 6.5) show that on TrickyBugs, TrickCatcher performs comparably to DPP on easy tasks but clearly outperforms on difficult ones, with program variants having higher base-test passing rates on hard tasks.", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "Buggy program variants can also contribute to generating true positive test cases (23.2% in TrickyBugs, 15.0% in EvalPlus of useful variants are buggy).", 401 "evidence": "Section 7.1 reports that 23.2% (TrickyBugs) and 15.0% (EvalPlus) of variants that produced correct oracles for TP test cases were actually buggy.", 402 "supported": "moderate" 403 }, 404 { 405 "claim": "TrickCatcher generalizes across different language models, with stronger models yielding better performance.", 406 "evidence": "Table 3 (Section 7.2) shows deepseek-v3 achieves 59.54% F1 on EvalPlus vs gpt-3.5-turbo's 51.34%, both using TrickCatcher.", 407 "supported": "weak" 408 } 409 ], 410 "red_flags": [ 411 { 412 "flag": "No uncertainty quantification despite averaged results", 413 "detail": "The paper describes an elaborate combinatorial repetition methodology (Appendix B) but reports only point estimates in all tables. With 210+ rounds per configuration, standard deviations could easily be reported but are absent, making it impossible to assess whether observed differences are meaningful or within noise." 414 }, 415 { 416 "flag": "No statistical significance tests", 417 "detail": "Claims of 'significantly outperforming' baselines are based purely on comparing averaged numbers. No statistical tests are performed despite the repetition methodology producing enough data for them." 418 }, 419 { 420 "flag": "Self-implemented modified baseline", 421 "detail": "The main baseline DPP is a self-modified version of Differential Prompting ('Differential Prompting Plus'), adapted by the authors for plausible programs. The original DP was not designed for this setting. The authors acknowledge modifying it but do not discuss whether their implementation may underperform the original authors' intent." 422 }, 423 { 424 "flag": "Ablation on single dataset only", 425 "detail": "The ablation study (Table 2) is reported only on TrickyBugs (C++) 'due to page limit.' Component contributions may differ across datasets and languages, but this is not verified." 426 }, 427 { 428 "flag": "No cost reporting despite budget-constrained model choice", 429 "detail": "The paper chose gpt-3.5-turbo specifically due to budget constraints but never quantifies the actual cost. The method requires generating 10 program variants + input generators per program, making cost a key practical consideration." 430 } 431 ], 432 "cited_papers": [ 433 { 434 "title": "Evaluating large language models trained on code", 435 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 436 "year": 2021, 437 "arxiv_id": "2107.03374", 438 "relevance": "Foundational benchmark (HumanEval/Codex) for LLM code generation evaluation." 439 }, 440 { 441 "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation", 442 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 443 "year": 2023, 444 "relevance": "EvalPlus benchmark used as one of the two evaluation datasets; addresses rigor in LLM code evaluation." 445 }, 446 { 447 "title": "Nuances are the key: Unlocking ChatGPT to find failure-inducing tests with differential prompting", 448 "authors": ["Tsz-On Li", "Wenxi Zong", "Yibo Wang"], 449 "year": 2023, 450 "relevance": "State-of-the-art baseline (Differential Prompting) for LLM-based test case generation for bug detection." 451 }, 452 { 453 "title": "TrickyBugs: A dataset of corner-case bugs in plausible programs", 454 "authors": ["Kaibo Liu", "Yudong Han", "Yiyang Liu", "Jie M. Zhang"], 455 "year": 2024, 456 "relevance": "Primary evaluation dataset of human-written plausible programs with tricky bugs from online judge platforms." 457 }, 458 { 459 "title": "Who judges the judge: An empirical study on online judge tests", 460 "authors": ["Kaibo Liu", "Yudong Han", "Jie M. Zhang", "Zhenpeng Chen"], 461 "year": 2023, 462 "relevance": "Identifies 3,440 tricky bugs in human-written programs on online judge platforms, motivating the TrickCatcher approach." 463 }, 464 { 465 "title": "Large language model-based agents for software engineering: A survey", 466 "authors": ["Junwei Liu", "Kaixin Wang", "Yixuan Chen"], 467 "year": 2024, 468 "arxiv_id": "2409.02977", 469 "relevance": "Survey of LLM-based agents for software engineering tasks including test generation." 470 }, 471 { 472 "title": "Evaluating and improving ChatGPT for unit test generation", 473 "authors": ["Zhiqiang Yuan", "Mingwei Liu", "Shiji Ding"], 474 "year": 2024, 475 "relevance": "Evaluates LLM-based unit test generation (ChatTester), a related approach in the test generation space." 476 }, 477 { 478 "title": "An empirical evaluation of using large language models for automated unit test generation", 479 "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"], 480 "year": 2024, 481 "relevance": "Empirical evaluation of LLMs (TestPilot) for automated unit test generation." 482 }, 483 { 484 "title": "Code-aware prompting: A study of coverage-guided test generation in regression setting using LLM", 485 "authors": ["Gabriel Ryan", "Siddhartha Jain", "Mingyue Shang"], 486 "year": 2024, 487 "relevance": "Coverage-guided LLM test generation (SymPrompt), a related approach combining code analysis with prompting." 488 }, 489 { 490 "title": "The counterfeit conundrum: Can code language models grasp the nuances of their incorrect generations?", 491 "authors": ["Alex Gu", "Wen-Ding Li", "Naman Jain"], 492 "year": 2024, 493 "relevance": "Studies whether code LLMs can distinguish correct from incorrect generated code, directly related to plausible program detection." 494 }, 495 { 496 "title": "B4: Towards optimal assessment of plausible code solutions with plausible tests", 497 "authors": ["Mouxiang Chen", "Zhongxin Liu", "He Tao"], 498 "year": 2024, 499 "relevance": "Addresses assessment of plausible code solutions, closely related to the plausible program testing problem." 500 } 501 ], 502 "engagement_factors": { 503 "practical_relevance": { 504 "score": 2, 505 "justification": "Practitioners working with LLM-generated code could use differential testing to find bugs, though the approach currently requires clear specifications." 506 }, 507 "surprise_contrarian": { 508 "score": 1, 509 "justification": "The diversity-driven oracle (trusting minority outputs over majority voting) is mildly counterintuitive but not paradigm-shifting." 510 }, 511 "fear_safety": { 512 "score": 0, 513 "justification": "No AI safety or security concerns raised; this is a testing/debugging tool." 514 }, 515 "drama_conflict": { 516 "score": 0, 517 "justification": "No controversy or provocative claims about existing methods or institutions." 518 }, 519 "demo_ability": { 520 "score": 2, 521 "justification": "Code released on GitHub with data, but not a pip-installable tool or live demo." 522 }, 523 "brand_recognition": { 524 "score": 1, 525 "justification": "Academic authors from Peking University and UCL; Mark Harman is well-known in software testing but not a household name." 526 } 527 } 528 }