scan.json (26960B)
1 { 2 "paper": { 3 "title": "LLM for Test Script Generation and Migration: Challenges, Capabilities, and Opportunities", 4 "authors": [ 5 "Shengcheng Yu", 6 "Chunrong Fang", 7 "Yuchen Ling", 8 "Chentian Wu", 9 "Zhenyu Chen" 10 ], 11 "year": 2023, 12 "venue": "International Conference on Software Quality, Reliability and Security (QRS)", 13 "arxiv_id": "2309.13574", 14 "doi": "10.1109/QRS60937.2023.00029" 15 }, 16 "scan_version": 3, 17 "active_modules": [], 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "No source code repository or archive is provided. The generated test scripts, prompts, and experimental artifacts are not released." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "No datasets are released. The experimental subjects are publicly available apps (Outlook, QQ Mail, NetEase Mail, Fliggy, Ctrip, Mafengwo), but the actual prompts, ChatGPT responses, and generated scripts are not made available." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "No environment specifications are provided. The paper mentions using the OpenAI API with gpt-3.5-turbo and Appium but provides no version numbers, dependency lists, or setup details." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No reproduction instructions are provided. The paper describes the general experimental approach but lacks specific steps, configuration files, or scripts needed to reproduce the experiments." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "No quantitative results are reported with confidence intervals or error bars. The paper's evaluation is entirely qualitative, with no numerical metrics reported for any experiment." 
46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "No statistical tests are used. The paper makes claims about LLM capabilities based on qualitative observation of a handful of experiments with no statistical analysis." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": false, 55 "justification": "No effect sizes are reported. The paper provides no quantitative measurements of success rates, completion rates, or any other metric that could establish effect magnitude." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "The sample consists of 6 mobile apps with approximately 9 experiments for RQ1, 3 for RQ2, and 6 for RQ3. No justification is given for why these numbers are sufficient to support the claims made." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "The paper mentions 'Some prompts are repeated to see whether ChatGPT will generate responses with different scripts to eliminate the effect of randomness' (Section 3.2) but reports no variance or spread measures from these repeated runs." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": false, 72 "justification": "No baselines are included. The paper does not compare ChatGPT's performance against any existing test generation or migration tools (e.g., CraftDroid, AppTestMigrator) despite discussing them in the related work." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": false, 77 "justification": "No baselines are included at all, so contemporariness is moot. The paper discusses several contemporary tools in Section 6 (e.g., CraftDroid, AppTestMigrator) but never compares against them." 
78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": false, 82 "justification": "The paper tests two approaches for RQ1 (direct prompting and dialogue-based), but does not perform systematic ablation of components. No controlled removal of individual features to measure their contribution." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": false, 87 "justification": "The paper mentions evaluating 'grammatical accuracy, semantic correctness, and practical applicability' (Section 4.1) but reports no quantitative metrics for any of these dimensions. All evaluation is qualitative narrative." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": true, 92 "justification": "The authors manually verify the generated scripts: 'their alignment with the predefined test operation process is manually confirmed' (Section 4.1), and scripts are executed on devices with manual checking of results across all three RQs." 93 }, 94 "held_out_test_set": { 95 "applies": false, 96 "answer": false, 97 "justification": "This is a qualitative case study with no training/testing split. There is no dataset or benchmark to hold out." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down by RQ (scenario-based generation, cross-platform migration, cross-app migration) and further by individual scenarios (login, sending email, adding email account, flight search) and apps." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "Multiple failure cases are discussed in detail: deprecated API usage, pop-up handling failures, input box focus issues (Section 4.1), context memory overflow causing failure on 'adding email account' scenario, and inability to self-correct scripts (Section 4.2)." 
108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper reports several negative results: ChatGPT failed to self-terminate the testing process in the 'adding email account' scenario, failed to generate a relevant script in the second test of that scenario, and could not self-correct errors in cross-platform migration (Sections 4.1, 4.2)." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract is relatively cautious, framing the work as an 'investigation' that 'aims to enhance software testing practices' and 'contribute to the understanding of LLMs' capabilities.' These aspirational/exploratory claims are broadly consistent with the qualitative findings reported." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper claims 'This approach significantly reduces manual intervention and enhances script generation efficiency' (Section 4.1) without any controlled comparison or quantitative measurement. No baseline is compared against to support the causal claim of reduction/enhancement." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title says 'LLM' broadly but only gpt-3.5-turbo is tested. Claims about 'LLMs' capabilities are generalized from a single model. The paper tests only 6 apps in 2 categories (email, travel) but draws broad conclusions about mobile app test script generation." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "No alternative explanations for the observed results are discussed. For example, the paper does not consider whether the successes are due to the simplicity of the chosen scenarios (login is a well-documented flow) or whether similar results could be achieved with simpler template-based approaches." 
135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper evaluates 'grammatical accuracy, semantic correctness, and practical applicability' of individual scripts but frames results as demonstrating LLM 'capabilities' for test automation broadly. The gap between manually checking a few scripts and claiming general test automation capability is not acknowledged." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper states 'we use the API provided by OpenAI with the gpt-3.5-turbo model' (Section 3.2) but provides no specific version or snapshot date. The gpt-3.5-turbo model changes over time." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": false, 151 "justification": "Prompt templates are provided for each phase (general prompt, initiation, exploration, summarization) but all contain placeholder values (e.g., '{}' for device names, package names, element IDs). The actual filled-in prompts used in experiments are not provided." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": false, 156 "justification": "No API hyperparameters (temperature, top-p, max tokens, etc.) are reported for the gpt-3.5-turbo model calls." 157 }, 158 "scaffolding_described": { 159 "applies": true, 160 "answer": true, 161 "justification": "The dialogue-based approach in RQ1 describes a three-phase scaffolding (initiation, exploration, summarization) with iterative feeding of XML page elements to ChatGPT and task-based decision making (Section 4.1). The workflow is described with prompt templates for each phase." 
162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": false, 166 "justification": "The paper describes manually acquiring 'comprehensive information of the whole testing process' (Section 4.1) but does not document how this information was extracted, filtered, or formatted before being used in prompts." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 5.1 'Challenges' serves as a limitations section, discussing four specific limitations: Context Memory, API Usage Randomness, Human Effort Requirement, and Limited Supported Test Events." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "Section 5.1 discusses general limitations of LLMs (context memory, API randomness, human effort, limited test events) rather than specific threats to the validity of this study's findings. No discussion of threats like selection bias in app choice, non-representative scenarios, or generalizability limits of a single-model study." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of what populations/settings are excluded or what claims the authors are not making. The title implies broad LLM applicability despite testing only one model on six apps." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "No raw data is available. The actual ChatGPT conversation logs, generated scripts, and execution results are not released for independent verification." 
191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": false, 195 "justification": "The paper describes the general approach (manually acquiring app information, constructing prompts, running experiments) but does not provide detailed data collection procedures such as exact dates, specific app versions tested, or complete interaction logs." 196 }, 197 "recruitment_methods_described": { 198 "applies": true, 199 "answer": false, 200 "justification": "App selection is described vaguely: 'carefully selected based on their popularity, usability, and the varying complexity of their features' (Section 3.3). No specific criteria, popularity metrics, or systematic selection process is documented." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": false, 205 "justification": "No data pipeline documentation. The paper goes from describing the experimental setup to reporting qualitative observations without documenting intermediate steps, how many prompts were tried, or how observations were systematically recorded." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Funding is disclosed in the Acknowledgment section: National Natural Science Foundation of China (62141215, 62272220, 62372228), Science, Technology and Innovation Commission of Shenzhen Municipality, and National Undergraduate Training Program for Innovation and Entrepreneurship." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "All authors are affiliated with Nanjing University, State Key Laboratory for Novel Software Technology. They evaluate a third-party product (ChatGPT by OpenAI) with no apparent affiliation conflict." 
218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": true, 222 "justification": "Funders are Chinese government research agencies and university programs with no financial stake in whether ChatGPT performs well at test script generation." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial interests statement is present in the paper." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": false, 233 "answer": false, 234 "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. It is a qualitative case study testing ChatGPT's ability to generate test scripts for specific mobile apps in real time." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": false, 238 "answer": false, 239 "justification": "No benchmark evaluation is performed. The experimental subjects are live mobile apps where ChatGPT generates scripts interactively, not a fixed test set." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": false, 243 "answer": false, 244 "justification": "No benchmark is used. The evaluation is qualitative, based on manual verification of generated scripts against specific mobile app scenarios." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study. The experimental subjects are mobile apps and LLM-generated test scripts." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 
267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "No API costs, token counts, or latency measurements are reported for any of the ChatGPT interactions across the experiments." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "No computational budget, hardware specifications, or total API spend is mentioned." 294 } 295 } 296 }, 297 "claims": [ 298 { 299 "claim": "LLMs have a strong capability in test script generation and migration tasks of different complexities when provided with sufficient information.", 300 "evidence": "Qualitative observations across three RQs showing ChatGPT can generate syntactically correct Appium scripts for login, email, and flight booking scenarios on 6 apps (Sections 4.1-4.3). No quantitative success rates or metrics provided.", 301 "supported": "weak" 302 }, 303 { 304 "claim": "ChatGPT can generate grammatically and syntactically correct Appium test scripts from detailed scenario descriptions.", 305 "evidence": "In RQ1, 'All scripts are both grammatically and syntactically correct, and their alignment with the predefined test operation process is manually confirmed' (Section 4.1). 
Nine experiments across three apps with varying complexity.", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "The dialogue-based approach significantly reduces manual intervention and enhances script generation efficiency compared to providing all information at once.", 310 "evidence": "The dialogue-based approach is described and tested on two scenarios (login, adding email account) on NetEase Mail (Section 4.1). No quantitative comparison of effort or efficiency between the two approaches.", 311 "supported": "weak" 312 }, 313 { 314 "claim": "Context memory limitations cause ChatGPT to fail on complex scenarios with many GUI elements.", 315 "evidence": "In the 'adding email account' scenario, ChatGPT 'loses track of the prior context and fails to remember the original task' when the page contained excessive clickable elements (Section 4.1). Demonstrated in two test instances.", 316 "supported": "moderate" 317 }, 318 { 319 "claim": "Cross-app test script migration essentially transforms into a few-shot generation task rather than true migration.", 320 "evidence": "In RQ3, the authors observe that 'similar scenarios in different apps share similar core business logic, but test procedures may vary greatly' and 'the migration task essentially transforms into a few-shot generation task that uses an existing test script as an example' (Section 4.3).", 321 "supported": "moderate" 322 }, 323 { 324 "claim": "ChatGPT lacks intrinsic ability to self-correct or autonomously acquire information about target applications.", 325 "evidence": "In cross-platform migration (Section 4.2), ChatGPT 'operates solely on the basis of human-provided information, blindly altering the old script without any inbuilt capacity for problem resolution.' 
When information was intentionally excluded, ChatGPT could not compensate.", 326 "supported": "moderate" 327 } 328 ], 329 "methodology_tags": [ 330 "case-study", 331 "qualitative" 332 ], 333 "key_findings": "ChatGPT (gpt-3.5-turbo) can generate syntactically correct Appium test scripts for mobile apps when provided with detailed element identifiers and scenario descriptions, but faces significant limitations including context memory overflow on complex scenarios, inconsistent API usage (including deprecated APIs), and inability to self-correct. Cross-platform migration works when sufficient difference information is provided, but cross-app migration effectively reduces to new script generation rather than true migration. The approach requires substantial human effort to provide input information and fix technical issues in generated scripts.", 334 "red_flags": [ 335 { 336 "flag": "No quantitative metrics", 337 "detail": "The entire evaluation is qualitative narrative. No success rates, completion rates, correctness scores, or any other quantitative metric is reported despite testing ~18 experiments. Claims of 'strong capability' rest on anecdotal observation." 338 }, 339 { 340 "flag": "Very small sample size", 341 "detail": "Only 6 apps tested across 2 categories (email, travel), with ~9 experiments for RQ1, ~3 for RQ2, and ~6 for RQ3. This is far too few to support general claims about LLM capabilities for mobile test automation." 342 }, 343 { 344 "flag": "Claims outrun evidence", 345 "detail": "The paper claims 'LLMs have a strong capability' and frames the work as investigating 'LLM' capabilities broadly, but tests only gpt-3.5-turbo. The title generalizes to 'LLM' from a single model's qualitative results." 346 }, 347 { 348 "flag": "No baselines or comparisons", 349 "detail": "Despite discussing several existing test generation and migration tools (CraftDroid, AppTestMigrator, Appium-based approaches) in Section 6, no comparison is made against any of them. 
There is no way to assess whether LLMs are better, worse, or equivalent to existing approaches." 350 }, 351 { 352 "flag": "Irreproducible", 353 "detail": "No code, data, prompts, or ChatGPT conversation logs are released. The prompt templates contain only placeholders. Without specific app versions, API settings, or actual prompts, the experiments cannot be reproduced." 354 } 355 ], 356 "cited_papers": [ 357 { 358 "title": "Application of Large Language Models to Software Engineering Tasks: Opportunities, Risks, and Implications", 359 "authors": ["I. Ozkaya"], 360 "year": 2023, 361 "relevance": "Directly surveys LLM applications in software engineering, including code generation, testing, and documentation." 362 }, 363 { 364 "title": "Teaching Large Language Models to Self-Debug", 365 "authors": ["X. Chen", "M. Lin", "N. Schärli", "D. Zhou"], 366 "year": 2023, 367 "arxiv_id": "2304.05128", 368 "relevance": "Proposes self-debugging with LLMs via few-shot demonstrations, relevant to LLM-based automated program repair." 369 }, 370 { 371 "title": "Large Language Models are Few-Shot Testers: Exploring LLM-Based General Bug Reproduction", 372 "authors": ["S. Kang", "J. Yoon", "S. Yoo"], 373 "year": 2022, 374 "arxiv_id": "2209.11515", 375 "relevance": "Introduces LIBRO, using LLMs to automate test generation from bug reports — directly relevant to LLM-based testing." 376 }, 377 { 378 "title": "Adaptive Test Generation Using a Large Language Model", 379 "authors": ["M. Schäfer", "S. Nadi", "A. Eghbali", "F. Tip"], 380 "year": 2023, 381 "arxiv_id": "2302.06527", 382 "relevance": "TestPilot uses LLMs for automatic unit test generation with adaptive prompting, directly relevant to LLM test generation capabilities." 383 }, 384 { 385 "title": "Prompting is All You Need: Automated Android Bug Replay with Large Language Models", 386 "authors": ["S. Feng", "C.
Chen"], 387 "year": 2023, 388 "arxiv_id": "2306.01987", 389 "relevance": "AdbGPT uses LLMs with prompt engineering for automated Android bug replay, closely related to LLM-based mobile testing." 390 }, 391 { 392 "title": "Fill in the Blank: Context-Aware Automated Text Input Generation for Mobile GUI Testing", 393 "authors": ["Z. Liu", "C. Chen", "J. Wang", "X. Che", "Y. Huang", "J. Hu", "Q. Wang"], 394 "year": 2022, 395 "arxiv_id": "2212.04732", 396 "relevance": "QTypist uses pre-trained LLMs to generate semantic input text for mobile GUI testing, directly relevant to LLM-aided mobile testing." 397 }, 398 { 399 "title": "ChatGPT: A Study on Its Utility for Ubiquitous Software Engineering Tasks", 400 "authors": ["G. Sridhara", "S. Mazumdar"], 401 "year": 2023, 402 "arxiv_id": "2305.16837", 403 "relevance": "Evaluates ChatGPT on various software engineering tasks, providing baseline for LLM capability assessment." 404 }, 405 { 406 "title": "Is ChatGPT the Ultimate Programming Assistant–How Far Is It?", 407 "authors": ["H. Tian", "W. Lu", "T. O. Li", "X. Tang", "S.-C. Cheung", "J. Klein", "T. F. Bissyandé"], 408 "year": 2023, 409 "arxiv_id": "2304.11938", 410 "relevance": "Evaluates ChatGPT as a programming assistant, relevant to understanding LLM capabilities in software engineering." 411 }, 412 { 413 "title": "Prompt Sapper: LLM-Empowered Software Engineering Infrastructure for AI-Native Services", 414 "authors": ["Z. Xing", "Q. Huang", "Y. Cheng", "L. Zhu", "Q. Lu", "X. Xu"], 415 "year": 2023, 416 "arxiv_id": "2306.02230", 417 "relevance": "Proposes LLM-powered infrastructure for software engineering, relevant to agentic AI programming tools." 418 }, 419 { 420 "title": "ChatGPT Prompt Patterns for Improving Code Quality, Refactoring, Requirements Elicitation, and Software Design", 421 "authors": ["J. White", "S. Hays", "Q. Fu", "J. Spencer-Smith", "D. C. 
Schmidt"], 422 "year": 2023, 423 "arxiv_id": "2303.07839", 424 "relevance": "Catalogues prompt engineering patterns for SE tasks, relevant to LLM-based software engineering methodology." 425 }, 426 { 427 "title": "Test Transfer Across Mobile Apps Through Semantic Mapping", 428 "authors": ["J.-W. Lin", "R. Jabbarvand", "S. Malek"], 429 "year": 2019, 430 "relevance": "CraftDroid performs cross-app test migration via semantic mapping — a key baseline for the test migration task this paper investigates." 431 }, 432 { 433 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 434 "authors": ["J. Wei", "X. Wang", "D. Schuurmans", "M. Bosma", "F. Xia", "E. Chi", "Q. V. Le", "D. Zhou"], 435 "year": 2022, 436 "arxiv_id": "2201.11903", 437 "relevance": "Foundational work on chain-of-thought prompting, referenced as part of prompt engineering advancement enabling LLM capabilities." 438 } 439 ], 440 "engagement_factors": { 441 "practical_relevance": { 442 "score": 1, 443 "justification": "Explores a practical problem (mobile test automation) but provides no usable tool, framework, or replicable methodology." 444 }, 445 "surprise_contrarian": { 446 "score": 0, 447 "justification": "Confirms expected findings — LLMs can generate some code but have limitations with context and API correctness." 448 }, 449 "fear_safety": { 450 "score": 0, 451 "justification": "No safety, security, or risk concerns raised." 452 }, 453 "drama_conflict": { 454 "score": 0, 455 "justification": "No controversy or conflict angle." 456 }, 457 "demo_ability": { 458 "score": 0, 459 "justification": "No code, tool, or demo released." 460 }, 461 "brand_recognition": { 462 "score": 1, 463 "justification": "Uses ChatGPT which is widely recognized, but the paper itself is from an academic lab without high brand recognition." 464 } 465 } 466 }