scan.json (30080B)
1 { 2 "paper": { 3 "title": "The Prompt Alchemist: Automated LLM-Tailored Prompt Optimization for Test Case Generation", 4 "authors": [ 5 "Shuzheng Gao", 6 "Chaozheng Wang", 7 "Cuiyun Gao", 8 "Xiaoqian Jiao", 9 "Chun Yong Chong", 10 "Shan Gao", 11 "Michael R. Lyu" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2501.01329", 16 "doi": "10.48550/arXiv.2501.01329" 17 }, 18 "scan_version": 3, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "MAPS, an automated prompt optimization method for LLM-based test case generation, outperforms four state-of-the-art prompt optimization baselines by 6.19% line coverage and 5.03% branch coverage on the Defects4J benchmark across ChatGPT, Llama-3.1, and Qwen2. The method demonstrates that different LLMs benefit from different optimized prompts, with each model achieving best performance on its own tailored prompt. An ablation study shows all three modules (diversity-guided prompt generation, failure-driven rule induction, domain contextual knowledge extraction) contribute meaningfully, with contextual knowledge extraction providing the largest individual gain.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "A replication package is provided on Zenodo at https://zenodo.org/records/14287744, referenced in Section VIII and throughout the paper (reference [29])." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The evaluation uses the publicly available Defects4J benchmark. The authors state 'We present the sampled development set Ddev in our replication package' (Section IV-D) along with seed prompts and prompt templates." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "Only hardware is mentioned: 'Ubuntu 20.04 server with a 112-core Intel Xeon Platinum CPU' (Section IV-D). 
No software environment specifications (requirements.txt, dependency versions, Python version, etc.) are provided in the paper." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper references a replication package on Zenodo but does not provide step-by-step reproduction instructions within the paper itself. No README with commands, no 'Reproducing Results' section." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": true, 49 "justification": "Standard deviations are reported in parentheses for MAPS results across all tables (e.g., '53.80 (0.04)' in Table IV). This provides uncertainty information." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "No statistical significance tests (p-values, t-tests, etc.) are reported anywhere in the paper. Claims like 'outperforms baseline methods by a large margin' are based solely on comparing point estimates." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Effect sizes are reported as absolute percentage differences with baseline context throughout (e.g., '6.19% higher line coverage rate' over the strongest baseline, with both baseline and MAPS scores visible in Tables IV-VI)." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "No justification is given for why 10 bugs were chosen as the development set, or why 147 bugs from 5 projects constitute a sufficient test set. No power analysis is provided." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": true, 69 "justification": "'We repeat MAPS three times and report its average results and variance to eliminate the influence of sampling and fluctuations in LLM' (Section IV-D). Standard deviations are shown in Tables IV-VI." 
70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Four prompt optimization baselines are compared: APE, OPRO, EVOPROMPT (GA), and EVOPROMPT (DE), plus a Basic prompt baseline (Section IV-C)." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "Baselines include APE (ICLR 2023), OPRO (ICLR 2024), and EVOPROMPT (ICLR 2024), all contemporary prompt optimization methods at the time of writing." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "Table VIII presents a comprehensive ablation study removing each of the three modules individually and testing only the context extraction module, across all three LLMs (RQ3, Section V-C)." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Two evaluation metrics are used: line coverage and branch coverage, reported for all experiments (Section IV-B)." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": false, 96 "justification": "Evaluation is entirely automated using coverage metrics. No human evaluation of test case quality, readability, or usefulness is performed." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": false, 101 "justification": "The test set includes the development set: 'we randomly sample ten bugs from the Defects4J benchmark as our development set Ddev and use all bugs as test set Dtest' (Section IV-D). The 10 dev bugs used for prompt optimization are included in the 147 test bugs." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Per-project breakdowns are provided for all five Defects4J projects (Chart, Cli, Csv, Gson, Lang) in Tables IV-VI, showing meaningful performance variation across projects." 
107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section VI-A presents two case studies showing how MAPS addresses specific failure patterns (exception handling in Listing 1, abstract class instantiation in Listing 2). Motivating examples in Section II-B also discuss failure cases." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "Baseline methods APE and OPRO sometimes perform worse than the basic prompt (Table IV: APE achieves 44.58 vs Basic 45.56 line coverage). The ablation study shows which removals hurt most. RQ4 shows diminishing returns with some parameter changes." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "The abstract claims '6.19% higher line coverage rate and a 5.03% higher branch coverage rate on average' compared to the strongest baseline. Tables IV-VI show MAPS achieving 53.80, 50.59, 45.51 line coverage vs EVOPROMPT(GA) at 46.63, 46.52, 38.17, yielding improvements of 7.17, 4.07, 7.34 averaging ~6.19%." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "Causal claims about module contributions ('removing X reduces performance by Y%') are supported by a controlled ablation study (Table VIII) where single components are removed while holding others constant. This is adequate single-variable manipulation." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The title claims 'Automated LLM-Tailored Prompt Optimization for Test Case Generation' broadly, but experiments are limited to Java (Defects4J). The paper claims 'our method is language-agnostic and can be easily adapted to other programming languages' (Section VI-C) without evidence. The claim of language-agnostic generalizability is unbounded." 
134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "Section VI-C discusses 'Limited LLMs' and 'Limited Programming Languages' as threats to validity, but these are scope limitations, not alternative explanations for the observed results. No confounds are discussed (e.g., whether the improvement comes from the additional context vs. the optimization, or whether the dev/test overlap biases results)." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper measures line coverage and branch coverage and frames results in terms of these same metrics without inflating to broader claims about test quality or software reliability. The measurements match the granularity of claims." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": true, 150 "justification": "Specific model versions are provided: 'gpt-3.5-turbo-0125', 'Llama-3.1-70B-Instruct', and 'Qwen2-72B-Instruct' (Section IV-C)." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": true, 155 "justification": "Prompt templates for modification, reflection, and transformation are shown in Fig. 3. The paper states 'The seed prompts, and all prompt templates used in our work can be found in our replication package' (Section IV-D) with a Zenodo URL. Example final prompts are shown in Fig. 6." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "MAPS-specific parameters are reported (seed prompts=5, N=2, I=5), but LLM inference hyperparameters (temperature, top-p, max tokens) are not reported for any of the three models used. These significantly affect LLM output." 161 }, 162 "scaffolding_described": { 163 "applies": true, 164 "answer": true, 165 "justification": "The iterative optimization pipeline is described in detail with Algorithms 1-2, Fig. 
1 workflow diagram, and separate subsections for each module (Sections III-B through III-D). The three-module architecture with failure feedback loops constitutes scaffolding and is well documented." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "The paper documents how focal methods are extracted from Defects4J projects, how in-file and cross-file context is gathered (Section III-B), how seed prompts are obtained, and how the development set is sampled (Section IV-D). Table III provides dataset statistics." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section VI-C 'Threats to Validity' provides a dedicated subsection discussing two main threats." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "The threats are specific to this study: 'some models are not covered' (limited LLMs) and 'Defects4J benchmark, which only contains Java projects' (limited programming languages). These identify concrete gaps in the evaluation." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": false, 187 "justification": "While the limitations section mentions Java-only and three LLMs, it then claims 'MAPS is model-agnostic' and 'language-agnostic and can be easily adapted to other programming languages' without evidence. The paper does not clearly state what the results do NOT show — it frames limitations as future work rather than boundaries." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": false, 194 "justification": "A replication package is on Zenodo, but the paper does not confirm that raw per-bug coverage results are included. Only aggregate results (project-level averages) are shown in the paper." 
195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Data collection is described: Defects4J benchmark with five Java projects, specific bug versions listed in Table III, focal method extraction procedure described in Section III-B." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. Data source is the standard Defects4J benchmark." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "The pipeline from benchmark selection → focal method extraction → context extraction → prompt optimization → test generation → coverage evaluation is documented across Sections III and IV. Table III shows dataset statistics at each level (bugs, focal classes, focal methods)." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding sources, grants, or acknowledgments section is visible in the paper text. Funding is not disclosed." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly listed: The Chinese University of Hong Kong, Harbin Institute of Technology (Shenzhen), Monash University Malaysia, and an independent researcher." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "No funding source is disclosed, making it impossible to assess funder independence. The absence of funding disclosure is a transparency gap." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial interests statement is present in the paper." 
232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "No training data cutoff dates are stated for any of the three models (gpt-3.5-turbo-0125, Llama-3.1-70B-Instruct, Qwen2-72B-Instruct) used in experiments." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "Defects4J is a widely used benchmark published in 2014 and extensively discussed online. No analysis of whether test examples appear in any model's training data is provided." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": false, 248 "justification": "Defects4J has been publicly available since 2014 and is a standard benchmark in software engineering research. All three models were trained well after its publication. The contamination risk is not discussed." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study. Evaluation is entirely automated and benchmark-based." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 
286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "No inference costs, API costs, token counts, or wall-clock times are reported. MAPS requires multiple iterative LLM calls for optimization (5 iterations × multiple prompt evaluations) but the cost is never quantified." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "Only hardware is mentioned ('Ubuntu 20.04 server with a 112-core Intel Xeon Platinum CPU'). No total compute budget (GPU hours, API spend, total wall-clock time) is reported." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": true, 304 "justification": "'We repeat MAPS three times and report its average results and variance' (Section IV-D). Standard deviations across runs are shown in Tables IV-VI." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": true, 309 "justification": "The paper explicitly states 'We repeat MAPS three times' (Section IV-D). However, baseline methods appear to be run only once." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "RQ4 (Section V-D) studies parameter sensitivity by varying seed prompts (3-6), N (1-3), and iterations (1-3), but the total computational budget for this search is not reported." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": true, 319 "justification": "Section V-D provides justification for parameter choices: 'the improvements over five seed prompts are marginal. Therefore, we set the number of seed prompts to five.' Similar justification for N=2 and I=5." 
320 }, 321 "multiple_comparison_correction": { 322 "applies": true, 323 "answer": false, 324 "justification": "Multiple comparisons are made across 3 LLMs × 5 projects × 2 metrics, but no significance tests are performed at all, let alone corrections for multiple comparisons." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors implement all baseline methods and compare against their own system. No discussion of author-evaluation bias or whether baselines were given comparable tuning effort." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "MAPS requires iterative optimization with multiple LLM calls per iteration (prompt generation, evaluation, reflection, rule induction) while baselines use simpler optimization. This compute difference is not quantified or discussed." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "Line coverage and branch coverage are used as sole quality measures without discussing whether coverage actually captures test effectiveness, fault detection, or practical test utility. No discussion of coverage metric limitations." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": true, 343 "answer": true, 344 "justification": "All methods are compared using the same underlying LLMs. The ablation study (Table VIII) separates the contributions of MAPS's three modules, addressing what comes from the optimization scaffold vs. the context extraction." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "Defects4J bugs date from 2006-2016. All models (GPT-3.5, Llama-3.1, Qwen2) were trained well after these dates. The models may have seen Defects4J bugs and their fixes during training. Not discussed." 
352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "MAPS provides cross-file context (subclass information, constructor signatures) as features. Whether this constitutes information that would be available in a realistic usage scenario is not discussed." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "The development set (10 bugs) is a subset of the test set (all 147 bugs). This overlap between optimization and evaluation data is not discussed as a leakage concern." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No leakage detection or prevention methods are employed (no canary strings, no membership inference, no temporal splits, no decontamination)." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "MAPS outperforms baseline prompt optimization methods by 6.19% higher line coverage and 5.03% higher branch coverage on average across three LLMs.", 373 "evidence": "Tables IV-VI show MAPS achieving average line coverage of 53.80/50.59/45.51 vs strongest baselines at 46.63/46.52/39.41 across ChatGPT/Llama-3.1/Qwen2 on Defects4J (Section V-A).", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "MAPS effectively generates LLM-tailored prompts, with each model achieving best performance on its own optimized prompt.", 378 "evidence": "Table VII shows each model achieves highest line coverage with its own prompt: ChatGPT 53.80% vs 51.35%/51.14%, Llama-3.1 50.59% vs 41.92%/43.98%, Qwen2 45.51% vs 35.98%/44.94% (Section V-B).", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "All three modules (diversity-guided prompt generation, failure-driven rule induction, domain contextual knowledge extraction) contribute to MAPS performance.", 383 "evidence": "Table VIII ablation study shows removing any module degrades performance across all three LLMs. 
Removing contextual knowledge causes the largest drop (e.g., -9.64% line coverage on ChatGPT) (Section V-C).", 384 "supported": "strong" 385 }, 386 { 387 "claim": "MAPS generates more diverse prompts than baseline methods during optimization.", 388 "evidence": "Section VI-A reports average edit distance of 27.0 for MAPS vs 9.3 for OPRO, but this is measured only against OPRO, not all baselines, and on a single metric.", 389 "supported": "weak" 390 }, 391 { 392 "claim": "Existing prompt optimization methods struggle to produce effective prompts for test case generation.", 393 "evidence": "Tables IV-VI show APE and OPRO sometimes performing worse than the basic prompt (e.g., APE achieves 44.58% vs Basic 45.56% line coverage on ChatGPT), with best baseline improving only 1.07% over basic (Section V-A).", 394 "supported": "moderate" 395 } 396 ], 397 "red_flags": [ 398 { 399 "flag": "Dev/test overlap", 400 "detail": "The development set (10 bugs used for prompt optimization) is a subset of the test set (all 147 bugs). This means prompts were optimized partially on data included in the final evaluation, inflating reported performance." 401 }, 402 { 403 "flag": "No statistical significance tests", 404 "detail": "All claims of outperformance are based on comparing point estimates without any statistical testing (no p-values, t-tests, or confidence intervals on differences). With only 3 runs of MAPS, the observed differences may not be statistically significant." 405 }, 406 { 407 "flag": "Unaddressed benchmark contamination", 408 "detail": "Defects4J has been publicly available since 2014 and is one of the most widely used Java benchmarks. All three evaluated models were trained well after its publication, creating substantial contamination risk that is never discussed." 
409 }, 410 { 411 "flag": "No cost comparison with baselines", 412 "detail": "MAPS requires multiple iterative LLM calls (prompt generation, evaluation, reflection, rule induction, rule validation) across 5 iterations. The computational cost is never reported or compared to simpler baselines, making practical applicability unclear." 413 }, 414 { 415 "flag": "Baselines not given equal treatment", 416 "detail": "MAPS is run 3 times with variance reported, but baseline methods appear to be run only once (no standard deviations shown for baselines in Tables IV-VI). This asymmetry makes comparisons less rigorous." 417 }, 418 { 419 "flag": "Small development set", 420 "detail": "Only 10 bugs (out of 147) are used as the development set for prompt optimization. The representativeness of this small sample for optimizing prompts is not justified or analyzed." 421 } 422 ], 423 "cited_papers": [ 424 { 425 "title": "Unit test case generation with transformers and focal context", 426 "authors": ["M. Tufano", "D. Drain", "A. Svyatkovskiy", "S. K. Deng", "N. Sundaresan"], 427 "year": 2020, 428 "arxiv_id": "2009.05617", 429 "relevance": "Pioneering deep learning approach for test case generation using transformers, directly relevant to LLM-based code generation evaluation." 430 }, 431 { 432 "title": "A3test: Assertion-augmented automated test case generation", 433 "authors": ["S. Alagarsamy", "C. Tantithamthavorn", "A. Aleti"], 434 "year": 2023, 435 "relevance": "State-of-the-art non-LLM deep learning baseline for test case generation, used as comparison in this paper." 436 }, 437 { 438 "title": "Evaluating and improving chatgpt for unit test generation", 439 "authors": ["Z. Yuan", "M. Liu", "S. Ding", "K. Wang", "Y. Chen", "X. Peng", "Y. Lou"], 440 "year": 2024, 441 "relevance": "Evaluates ChatGPT for test case generation with iterative refinement; directly studies LLM capability in software testing." 
442 }, 443 { 444 "title": "Connecting large language models with evolutionary algorithms yields powerful prompt optimizers", 445 "authors": ["Q. Guo", "R. Wang", "J. Guo", "B. Li", "K. Song", "X. Tan", "G. Liu", "J. Bian", "Y. Yang"], 446 "year": 2024, 447 "relevance": "State-of-the-art prompt optimization method (EVOPROMPT) using evolutionary algorithms, key baseline in this paper." 448 }, 449 { 450 "title": "Large language models are human-level prompt engineers", 451 "authors": ["Y. Zhou", "A. I. Muresanu", "Z. Han", "K. Paster", "S. Pitis", "H. Chan", "J. Ba"], 452 "year": 2023, 453 "relevance": "Foundational work on automated prompt engineering (APE), baseline method and key related work." 454 }, 455 { 456 "title": "Large language models as optimizers", 457 "authors": ["C. Yang", "X. Wang", "Y. Lu", "H. Liu", "Q. V. Le", "D. Zhou", "X. Chen"], 458 "year": 2024, 459 "relevance": "OPRO method using LLMs for optimization including prompt optimization, key baseline in this paper." 460 }, 461 { 462 "title": "Codamosa: Escaping coverage plateaus in test generation with pre-trained large language models", 463 "authors": ["C. Lemieux", "J. P. Inala", "S. K. Lahiri", "S. Sen"], 464 "year": 2023, 465 "relevance": "Combines LLMs with search-based testing to improve coverage, relevant to LLM-assisted test generation." 466 }, 467 { 468 "title": "Code-aware prompting: A study of coverage-guided test generation in regression setting using LLM", 469 "authors": ["G. Ryan", "S. Jain", "M. Shang", "S. Wang", "X. Ma", "M. K. Ramanathan", "B. Ray"], 470 "year": 2024, 471 "relevance": "Studies coverage-guided LLM prompting for test generation, directly relevant to prompt engineering for testing." 472 }, 473 { 474 "title": "Keep the conversation going: Fixing 162 out of 337 bugs for $0.42 each using chatgpt", 475 "authors": ["C. S. Xia", "L. 
Zhang"], 476 "year": 2023, 477 "arxiv_id": "2304.00385", 478 "relevance": "ChatRepair uses iterative LLM interaction for program repair, relevant to LLM-based software engineering capabilities." 479 }, 480 { 481 "title": "Software testing with large language models: Survey, landscape, and vision", 482 "authors": ["J. Wang", "Y. Huang", "C. Chen", "Z. Liu", "S. Wang", "Q. Wang"], 483 "year": 2024, 484 "relevance": "Comprehensive survey of LLM-based software testing, directly relevant to the survey scope." 485 }, 486 { 487 "title": "Search-based llms for code optimization", 488 "authors": ["S. Gao", "C. Gao", "W. Gu", "M. Lyu"], 489 "year": 2024, 490 "relevance": "Combines search-based methods with LLMs for code intelligence, relevant to LLM optimization approaches." 491 }, 492 { 493 "title": "Self-edit: Fault-aware code editor for code generation", 494 "authors": ["K. Zhang", "Z. Li", "J. Li", "G. Li", "Z. Jin"], 495 "year": 2023, 496 "relevance": "Uses compiler error messages to improve LLM code generation, relevant to feedback-driven LLM coding approaches." 497 } 498 ], 499 "engagement_factors": { 500 "practical_relevance": { 501 "score": 2, 502 "justification": "Prompt optimization for test generation has clear practitioner value, though the iterative optimization pipeline requires significant setup and compute." 503 }, 504 "surprise_contrarian": { 505 "score": 1, 506 "justification": "The finding that different LLMs need different prompts is somewhat expected; the contribution is in automating the optimization rather than a surprising finding." 507 }, 508 "fear_safety": { 509 "score": 0, 510 "justification": "No AI safety, security, or risk concerns raised by this work." 511 }, 512 "drama_conflict": { 513 "score": 0, 514 "justification": "No controversial claims or conflicts; straightforward method paper." 
515 }, 516 "demo_ability": { 517 "score": 1, 518 "justification": "Replication package on Zenodo is available but it is not a polished pip-installable tool or live demo." 519 }, 520 "brand_recognition": { 521 "score": 1, 522 "justification": "Uses well-known models (ChatGPT, Llama) but the paper and method themselves are not from a famous AI lab." 523 } 524 } 525 }