scan.json (30488B)
1 { 2 "paper": { 3 "title": "HITS: High-coverage LLM-based Unit Test Generation via Method Slicing", 4 "authors": [ 5 "Zejun Wang", 6 "Kaibo Liu", 7 "Ge Li", 8 "Zhi Jin" 9 ], 10 "year": 2024, 11 "venue": "International Conference on Automated Software Engineering", 12 "arxiv_id": "2408.11324", 13 "doi": "10.1145/3691620.3695501" 14 }, 15 "scan_version": 3, 16 "active_modules": [ 17 "experimental_rigor", 18 "data_leakage" 19 ], 20 "methodology_tags": [ 21 "benchmark-eval" 22 ], 23 "key_findings": "HITS decomposes complex Java methods into slices and generates unit tests per slice using an LLM, achieving 55.09% average line coverage and 48.12% branch coverage on complex methods—outperforming ChatUniTest (32.48%/27.07%), ChatTester (20.71%/18.20%), SymPrompt (26.32%/25.10%), and Evosuite (39.10%/38.46%). Notably, existing LLM-based baselines perform worse than the traditional SBST tool Evosuite on complex methods, reversing the advantage seen on average-complexity methods. The ablation study confirms that the slicing decomposition is the primary contributor to improvements.", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper provides an anonymous artifact link: https://anonymous.4open.science/r/SlicePromptTest4J-6CF1/. While an anonymous review link, it constitutes a provided URL." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "The dataset comprises complex methods extracted from 10 open-source projects (Table 2), all publicly available. The artifact link likely includes the dataset specification. The base projects (Commons-CLI, Gson, etc.) are all public." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper mentions gpt-turbo-3.5-0125, Java, JUnit 5, Jacoco, and Evosuite but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions." 40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "No step-by-step reproduction instructions are provided in the paper. The anonymous artifact link may contain a README, but the paper itself does not describe how to replicate the experiments." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "All results in Tables 4-7 are point estimates with no confidence intervals or error bars." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "The abstract claims HITS 'significantly outperforms' baselines, but no statistical significance tests (p-values, t-tests, etc.) are reported anywhere. Comparisons are based solely on comparing raw numbers." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "The paper reports absolute coverage scores with baseline comparisons (e.g., HITS 55.09% vs ChatUniTest 32.48% line coverage) and describes improvements as 'a percentage ranging from 10 to 20,' providing enough context to assess effect magnitude." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "The dataset contains 120 complex methods from 10 projects. No power analysis or sample size justification is provided. Budget constraints are mentioned as a limitation but not as a justification." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "Greedy generation is used (deterministic single-run), so no variance across runs is reported. No standard deviation, IQR, or spread measures appear in any table." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "Four baselines are included: ChatUniTest, ChatTester, SymPrompt (LLM-based), and Evosuite (SBST). Results are compared across all in Tables 4-6." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "ChatUniTest (2023), ChatTester (2023), and SymPrompt (2024) are recent LLM-based methods. Evosuite is an established and widely-used SBST baseline." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "Table 7 presents an ablation study with two configurations: 'w/o slicing' and 'w/o slicing & PE' (equivalent to ChatUniTest + PP), measuring the contribution of slicing, prompt engineering, and post-processing." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "Three metrics are reported: line coverage (Table 4), branch coverage (Table 5), and pass rate (Table 6)." 94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": false, 98 "justification": "No human evaluation of generated test quality is performed. All evaluation is automated via Jacoco coverage measurement and execution correctness checking." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": false, 103 "justification": "There is no explicit separation between prompt development data and evaluation data. The prompts and examples were manually crafted, and it is unclear whether any of the 10 evaluation projects were used during prompt development." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Tables 4-7 provide per-project breakdowns for all 10 projects, and Table 2 splits projects into 'Learned' (in training data) and not learned." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 5.1 provides a case study showing where ChatUniTest fails and how HITS addresses it. Section 5.3 analyzes the distribution of non-executable tests (compilation vs runtime errors)." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "The paper reports that Evosuite outperforms all LLM-based methods on several specific projects (e.g., COD, DAT). The ablation study shows the slicing workflow hurts branch coverage on a few projects (COL, JDO). Section 4.4 analyzes why." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "The abstract claims HITS 'significantly outperforms current test case generation methods with LLMs and the typical SBST method Evosuite regarding both line and branch coverage scores.' Tables 4 and 5 show HITS averaging 55.09%/48.12% vs next best Evosuite at 39.10%/38.46%." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper claims slicing 'improves' coverage (causal language). The ablation study (Table 7) removes slicing while keeping other components, measuring the isolated effect. This controlled single-variable manipulation is adequate for the causal claim." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": false, 135 "justification": "The title ('LLM-based Unit Test Generation') implies broad applicability, but results are limited to 10 Java projects tested with a single LLM (gpt-3.5-turbo). The method relies on Java-specific tooling (JUnit 5, Mockito, Jacoco). The threats section acknowledges limited LLM generalization but the title and framing remain broad." 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "Section 5.4 discusses threats (limited dataset, limited LLMs) but these are methodological limitations, not alternative explanations for the results. The paper does not discuss confounds like whether improvements come from longer prompts providing more context, or whether the chain-of-thought approach rather than slicing per se drives the gains." 141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": false, 145 "justification": "The paper measures line and branch coverage as the primary outcomes and implicitly equates these with test quality. No discussion of the gap between coverage and actual fault detection ability, mutation score, or other measures of test effectiveness." 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": true, 152 "justification": "The paper specifies 'gpt-turbo-3.5-0125' as the model, which includes the specific version/snapshot identifier." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": false, 157 "justification": "Figure 2 shows prompt templates with structural placeholders ({{ focal method }}, {{ dependencies }}) and abbreviated examples ('... (the example)'). The actual filled prompts and hand-crafted examples referenced in the prompts are not fully provided." 158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper states 'greedy generation first' and 'raise the top-p slowly and gradually' on format violations, but does not specify the actual top-p values used. Max fixing rounds (10) are stated, but other API parameters (max tokens, etc.) are not reported." 163 }, 164 "scaffolding_described": { 165 "applies": true, 166 "answer": true, 167 "justification": "The multi-step pipeline is described in detail: static analysis for context retrieval, chain-of-thought method decomposition into slices, per-slice test generation with scenario analysis, self-debug for broken tests, and post-processing (format extraction, test isolation, rule-based fixing). Figure 2 provides an overview diagram." 168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 4.1 describes the dataset construction: projects selected by domain overlap with prior work, 1-30 complex methods, high GitHub stars. Complex methods defined as cyclomatic complexity > 10. Table 2 shows per-project statistics." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 5.4 'Threats to Validity' provides a dedicated discussion of limitations." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section 5.4 identifies specific threats: limited to projects with ≤30 complex methods due to budget, gpt-3.5-turbo chosen for cost/capability tradeoff, and notes that 'other LLMs are either incapable of understanding the instructions of HITS to generate or are much more expensive.'" 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": false, 189 "justification": "The paper does not explicitly state what the results do NOT show. The threats section mentions limitations (limited dataset, limited LLMs) but doesn't draw explicit scope boundaries like 'these results do not generalize to non-Java languages' or 'we do not claim fault detection effectiveness.'" 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": false, 196 "justification": "While an anonymous artifact link is provided, the paper does not explicitly state that raw experimental data (individual test outputs, coverage reports per method, LLM responses) are available for independent verification." 197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 4.1 describes three project selection standards: (1) domain used in Evosuite/ChatUniTest, (2) contains 1-30 complex methods, (3) prioritize high-star GitHub projects. Complex methods defined via cyclomatic complexity > 10." 202 }, 203 "recruitment_methods_described": { 204 "applies": true, 205 "answer": true, 206 "justification": "The sample selection process is described: projects crawled from the Internet following three explicit standards (domain overlap, method count range, GitHub stars). Table 2 provides per-project metadata including domain and version." 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": false, 211 "justification": "The paper describes the selection criteria but does not document the full pipeline from initial candidate pool to final dataset. There's no accounting of how many candidate projects were considered, how many were filtered at each stage, or the total methods before complexity filtering." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding or acknowledgments section is present in the paper. The authors mention 'limited budget of the experiment fund' in Section 5.4, implying funding exists but the source is not disclosed." 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "All authors are listed as affiliated with Key Lab of HCST (PKU), MOE; SCS, Beijing, China. No conflict with evaluated products (they use OpenAI API but don't work at OpenAI)." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": false, 228 "justification": "Since funding is not disclosed (despite mentioning an 'experiment fund'), independence of the funder cannot be assessed." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests or financial interests statement is present in the paper." 234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": true, 239 "answer": false, 240 "justification": "The paper references 'the cutoff date of the training data of gpt-turbo-3.5' and splits projects by learned/not-learned, but never states the actual cutoff date." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": true, 244 "answer": true, 245 "justification": "Table 2 explicitly marks which projects were 'Learned' (created before training cutoff) and which were not. The paper discusses how projects in the training set may allow the LLM to 'recall' tests from memory (Section 4.4)." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": true, 249 "answer": true, 250 "justification": "The paper designs the experiment around contamination: 6 projects in training data, 4 after cutoff. Results are reported separately. Section 4.4 discusses how 'Commons-collections is more broadly used' and the LLM may rely on training data recall rather than following instructions." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study. It is a benchmark evaluation of automated test generation tools." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants in this study." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "No API costs, tokens consumed, or wall-clock time are reported despite the method making many LLM calls per focal method (decomposition + per-slice generation + fixing rounds)." 295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "The total computational budget is not quantified. The paper mentions 'limited budget of the experiment fund' but provides no figures for total API spend, tokens, or compute hours." 300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "Greedy generation is used (deterministic), so only a single run is performed. No analysis of sensitivity to random seeds or non-deterministic settings." 307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": false, 311 "justification": "The paper states 'greedy generation first' implying a single run, but does not explicitly state the number of experimental runs. When top-p is raised for format violations, the number of retries is not specified." 312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": false, 316 "justification": "No hyperparameter search is reported. The prompts and examples were manually crafted, but no search budget or number of configurations tried is stated." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": false, 321 "justification": "The paper presents a single configuration without explaining how it was selected or whether alternatives were tried. Prompt design choices appear final without justification for selection." 322 }, 323 "multiple_comparison_correction": { 324 "applies": false, 325 "answer": false, 326 "justification": "No statistical tests are performed, so correction for multiple comparisons is not applicable." 327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "The authors re-implemented SymPrompt ('We implement SymPrompt since we have found no implementations') but do not acknowledge the bias of evaluating their own implementation of a baseline. ChatUniTest and ChatTester use open-source implementations." 332 }, 333 "compute_budget_vs_performance": { 334 "applies": true, 335 "answer": false, 336 "justification": "Table 3 shows test case counts but not compute costs. HITS makes multiple LLM calls per method (decomposition + per-slice generation + fixing) while baselines use different strategies. No performance-vs-compute analysis is provided." 337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": false, 341 "justification": "The paper uses line and branch coverage as the sole evaluation criteria without discussing whether coverage actually measures test effectiveness, fault detection ability, or test quality. The proxy gap between coverage and real-world test value is not addressed." 342 }, 343 "scaffold_confound_addressed": { 344 "applies": true, 345 "answer": true, 346 "justification": "All LLM-based methods use the same model (gpt-turbo-3.5-0125), isolating the scaffolding/prompting approach as the experimental variable. The comparison explicitly tests different scaffolding strategies under a controlled model." 347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": true, 352 "answer": true, 353 "justification": "The paper splits projects into 6 'Learned' (before training cutoff) and 4 'Not Learned' (after cutoff) and reports results for both groups, directly addressing temporal leakage." 354 }, 355 "feature_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether the evaluation setup leaks information through context. The test generation task provides source code as input (standard practice), but potential leakage through project-level context is not discussed." 359 }, 360 "non_independence_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "No discussion of whether methods within the same project share structural similarities that could inflate results, or whether training data for the LLM includes these specific projects' test suites." 364 }, 365 "leakage_detection_method": { 366 "applies": true, 367 "answer": true, 368 "justification": "Temporal splitting is used as a concrete prevention method: projects are categorized by whether they were created before or after the model's training data cutoff date." 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "HITS outperforms all baselines on both line and branch coverage scores for complex methods.", 375 "evidence": "Table 4: HITS 55.09% avg line coverage vs ChatUniTest 32.48%, ChatTester 20.71%, SymPrompt 26.32%, Evosuite 39.10%. Table 5: HITS 48.12% avg branch coverage vs ChatUniTest 27.07%, ChatTester 18.20%, SymPrompt 25.10%, Evosuite 38.46%.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Existing LLM-based test generation methods are inferior to the SBST method Evosuite when testing complex methods.", 380 "evidence": "Tables 4-5: Evosuite (39.10%/38.46%) outperforms ChatUniTest (32.48%/27.07%), ChatTester (20.71%/18.20%), and SymPrompt (26.32%/25.10%) on average, contrary to prior claims of LLM superiority on average-complexity methods.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "The slicing decomposition is the primary contributor to HITS's coverage improvements.", 385 "evidence": "Table 7 ablation: HITS 55.09%/48.12% vs w/o slicing 50.61%/42.97% vs w/o slicing & PE (ChatUniTest+PP) 48.56%/42.38%. Slicing alone adds ~4.5pp line and ~5pp branch coverage on average.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "HITS has a higher pass rate (proportion of executable tests) than all LLM-based baselines.", 390 "evidence": "Table 6: HITS 69.11% average pass rate vs ChatUniTest 41.82%, SymPrompt 21.00%, ChatTester 16.10%.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Generating tests slice by slice reduces the input condition combinations from multiplicative to additive, simplifying the LLM's analysis task.", 395 "evidence": "Section 3.2 provides the theoretical argument: for n slices with ni conditions each, slice-by-slice inference requires Σni combinations vs Πni for whole-method inference. The case study (Section 5.1, Figure 3) illustrates this concretely.", 396 "supported": "weak" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "No statistical significance tests", 402 "detail": "The abstract claims HITS 'significantly outperforms' baselines, but no statistical tests are performed. All comparisons are raw number comparisons across 10 projects with no p-values, t-tests, or bootstrap tests." 403 }, 404 { 405 "flag": "Small dataset with selection bias", 406 "detail": "Only 10 projects with 120 complex methods, selected to have 1-30 complex methods (excluding larger projects due to budget). This selection criterion may bias toward smaller, simpler projects." 407 }, 408 { 409 "flag": "Single-LLM evaluation", 410 "detail": "Only gpt-3.5-turbo is evaluated. The paper acknowledges 'other LLMs are either incapable of understanding the instructions of HITS to generate or are much more expensive,' suggesting the method may not generalize." 411 }, 412 { 413 "flag": "Self-implemented baseline (SymPrompt)", 414 "detail": "The authors re-implemented SymPrompt since no public implementation exists. This introduces potential bias, as authors' implementations of baselines systematically underperform (Lucic et al., 2018)." 415 }, 416 { 417 "flag": "Single deterministic run with no variance reporting", 418 "detail": "Greedy generation produces a single deterministic result per configuration. No multiple runs, no variance, no error bars. The stability of results is unknown." 419 }, 420 { 421 "flag": "No cost reporting despite multi-call pipeline", 422 "detail": "HITS makes multiple LLM calls per method (decomposition + per-slice generation + up to 10 fixing rounds per broken test) but reports no API costs, making practical cost comparison with baselines impossible." 423 } 424 ], 425 "cited_papers": [ 426 { 427 "title": "ChatUniTest: a ChatGPT-based automated unit test generation tool", 428 "authors": ["Zhuokui Xie", "Yinghao Chen", "Chen Zhi", "Shuiguang Deng", "Jianwei Yin"], 429 "year": 2023, 430 "arxiv_id": "2305.04764", 431 "relevance": "Primary baseline and inspiration for HITS; practical LLM-based Java unit test generation tool." 432 }, 433 { 434 "title": "No More Manual Tests? Evaluating and Improving ChatGPT for Unit Test Generation", 435 "authors": ["Zhiqiang Yuan", "Yiling Lou", "Mingwei Liu", "Shiji Ding", "Kaixin Wang", "Yixuan Chen", "Xin Peng"], 436 "year": 2023, 437 "arxiv_id": "2305.04207", 438 "relevance": "ChatTester baseline; evaluates ChatGPT for unit test generation showing LLMs outperform SBST on average methods." 439 }, 440 { 441 "title": "Code-Aware Prompting: A study of Coverage Guided Test Generation in Regression Setting using LLM", 442 "authors": ["Gabriel Ryan", "Siddhartha Jain", "Mingyue Shang", "Shiqi Wang", "Xiaofei Ma", "Murali Krishna Ramanathan", "Baishakhi Ray"], 443 "year": 2024, 444 "arxiv_id": "2402.00097", 445 "relevance": "SymPrompt baseline; uses execution paths as scaffolds for LLM-based test generation." 446 }, 447 { 448 "title": "Evolutionary Generation of Whole Test Suites", 449 "authors": ["Gordon Fraser", "Andrea Arcuri"], 450 "year": 2011, 451 "doi": "10.1109/QSIC.2011.19", 452 "relevance": "Evosuite SBST baseline; the standard search-based test generation tool compared against LLM methods." 453 }, 454 { 455 "title": "Codamosa: Escaping coverage plateaus in test generation with pre-trained large language models", 456 "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K Lahiri", "Siddhartha Sen"], 457 "year": 2023, 458 "relevance": "Hybrid SBST+LLM approach that calls LLM when traditional tools plateau, relevant to LLM-assisted test generation." 459 }, 460 { 461 "title": "An empirical evaluation of using large language models for automated unit test generation", 462 "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"], 463 "year": 2023, 464 "relevance": "Empirical study of LLMs for unit test generation, establishing baseline expectations for LLM test quality." 465 }, 466 { 467 "title": "Software testing with large language models: Survey, landscape, and vision", 468 "authors": ["Junjie Wang", "Yuchao Huang", "Chunyang Chen", "Zhe Liu", "Song Wang", "Qing Wang"], 469 "year": 2024, 470 "relevance": "Comprehensive survey of LLM-based software testing methods and landscape." 471 }, 472 { 473 "title": "Teaching Large Language Models to Self-Debug", 474 "authors": ["Xinyun Chen", "Maxwell Lin", "Nathanael Schärli", "Denny Zhou"], 475 "year": 2024, 476 "relevance": "Self-Debug technique used by HITS to fix broken generated tests via LLM self-repair." 477 }, 478 { 479 "title": "Self-Refine: Iterative Refinement with Self-Feedback", 480 "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"], 481 "year": 2023, 482 "relevance": "Self-Refine technique used by HITS for prompt optimization (Section 5.2)." 483 }, 484 { 485 "title": "Evaluating large language models trained on code", 486 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 487 "year": 2021, 488 "arxiv_id": "2107.03374", 489 "relevance": "Codex paper establishing foundational evaluation of LLMs for code generation." 490 }, 491 { 492 "title": "Unit test case generation with transformers and focal context", 493 "authors": ["Michele Tufano", "Dawn Drain", "Alexey Svyatkovskiy", "Shao Kun Deng", "Neel Sundaresan"], 494 "year": 2020, 495 "relevance": "ATHENATEST — early deep learning approach to unit test generation that outperformed Evosuite on some tasks." 496 }, 497 { 498 "title": "ChatGPT vs SBST: A comparative assessment of unit test suite generation", 499 "authors": ["Yutian Tang", "Zhijie Liu", "Zhichao Zhou", "Xiapu Luo"], 500 "year": 2024, 501 "relevance": "Direct comparison of ChatGPT vs search-based testing, relevant to the LLM-vs-SBST comparison in this paper." 502 }, 503 { 504 "title": "Feedback-Directed Random Test Generation", 505 "authors": ["Carlos Pacheco", "Shuvendu K. Lahiri", "Michael D. Ernst", "Thomas Ball"], 506 "year": 2007, 507 "doi": "10.1109/ICSE.2007.37", 508 "relevance": "Randoop — classic feedback-directed random test generation tool, foundational SBST baseline." 509 } 510 ], 511 "engagement_factors": { 512 "practical_relevance": { 513 "score": 2, 514 "justification": "A working Java test generation tool with an artifact link, but requires API calls to GPT-3.5 and is specific to complex methods." 515 }, 516 "surprise_contrarian": { 517 "score": 1, 518 "justification": "The finding that existing LLM tools perform worse than Evosuite on complex methods is mildly surprising, but the decomposition approach itself is expected." 519 }, 520 "fear_safety": { 521 "score": 0, 522 "justification": "No AI safety or security concerns raised by this work." 523 }, 524 "drama_conflict": { 525 "score": 0, 526 "justification": "No controversy or conflict; straightforward technical contribution." 527 }, 528 "demo_ability": { 529 "score": 1, 530 "justification": "Anonymous artifact link provided but unclear how easily the tool can be run by others." 531 }, 532 "brand_recognition": { 533 "score": 1, 534 "justification": "Authors from PKU (reputable but not headline-grabbing); uses GPT-3.5 but is not about a famous product." 535 } 536 } 537 }