scan.json (26454B)
1 { 2 "paper": { 3 "title": "Automated Unit Test Improvement using Large Language Models at Meta", 4 "authors": [ 5 "Nadia Alshahwan", 6 "Jubin Chheda", 7 "Anastasia Finegenova", 8 "Beliz Gokkaya", 9 "Mark Harman", 10 "Inna Harper", 11 "Alexandru Marginean", 12 "Shubho Sengupta", 13 "Eddy Wang" 14 ], 15 "year": 2024, 16 "venue": "FSE 2024 (32nd ACM Symposium on the Foundations of Software Engineering)", 17 "arxiv_id": "2402.09171", 18 "doi": "" 19 }, 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No source code or repository link is provided. The tool is an internal Meta system and the paper explicitly states details are 'commercially sensitive.'" 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": false, 30 "justification": "No dataset is released. All experiments were conducted on internal Meta codebases (Instagram, Facebook) and no data is made available." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "No environment specifications, dependency lists, or infrastructure details are provided. The LLMs used are internal and unnamed ('LLM1' and 'LLM2')." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No reproduction instructions are provided. The tool operates on internal Meta infrastructure with internal LLMs, and no steps are given to recreate the setup." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "No confidence intervals or error bars are reported. Results are presented as point estimates (e.g., '75% of test classes had at least one new test case that builds correctly') without uncertainty quantification." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "No statistical significance tests are used. Comparisons between LLM1 vs. 
LLM2, different temperatures, and different prompts are made based solely on raw success rate numbers without any statistical testing." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "While the paper reports raw percentages (e.g., 25% coverage improvement rate, 73% acceptance rate), these are descriptive counts rather than standardized effect sizes. The paper notes 'the effect size was low' for temperature comparisons (Section 3.3) but does not quantify it." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "Sample sizes are reported (86 test classes for evaluation, 1,979 test classes for deployment) but no justification is given for why these sizes are adequate, and no power analysis is discussed." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "No variance, standard deviation, or spread measures are reported. The paper reports medians (e.g., 'median number of lines of code added by a TestGen-LLM test in the test-a-thon was 2.5') without any spread measure." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "The paper compares TestGen-LLM performance to human engineers at the test-a-thon (Table 1), and contextualizes results against prior work such as SapFix acceptance rates (50% vs. 73%), Siddiq et al.'s coverage results, and other LLM test generation tools." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "Comparisons reference contemporary work from 2023 including Nie et al., Yuan et al., Siddiq et al., and Schäfer et al. The related work section discusses recent LLM-based test generation literature." 
80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Section 3.3 evaluates the contribution of individual components: two LLMs (LLM1, LLM2), four prompt strategies, and temperature settings. Results show unique contributions of each prompt and LLM combination." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Multiple metrics are reported: build success rate (75%), pass rate (57%), coverage improvement rate (25%), acceptance rate by engineers (73%), and per-test-case success rates." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": true, 94 "justification": "Engineers reviewed TestGen-LLM's generated diffs as part of the deployment. The 73% acceptance rate reflects human evaluation of the system's outputs. Additionally, the paper reports manual verification that generated tests covered valid corner cases (Section 3.2.1)." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": false, 99 "justification": "No held-out test set is used. The evaluation in Section 3.3 was conducted on 86 Kotlin components, but there is no separation of tuning and evaluation sets. Tuning decisions (default temperature, LLM, prompt) were made on the same data used to report results." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results are broken down by platform (Facebook vs. Instagram, Tables 3 and 6), by LLM (LLM1 vs. LLM2, Table 5), by temperature (Table 4), and by prompt strategy (Section 3.3)." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 3.4 discusses the 4 rejected diffs and their reasons: trivial methods, single responsibility violations, missing assertions. Section 5 discusses LLM 'self-plagiarism', TODO-only tests, and other failure modes." 
110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper reports that the statement_to_complete prompt 'failed to add any unique test cases' for LLM1 (Section 3.3). It also notes the low success rate per test case (~4%), and that only 25% of classes gained coverage despite 57% passing." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Abstract claims (75% build, 57% pass, 25% coverage increase, 11.5% class improvement, 73% acceptance) are supported by data in Sections 3.3 and 4. The abstract claim of '11.5%' is slightly higher than the '10%' reported in Section 4, but both figures appear in the paper (10% overall from Section 4, 11.5% from combined test-a-thon data)." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": false, 126 "justification": "The paper makes implicit causal claims such as the ensemble approach producing better results, temperature zero being optimal, and LLM2 outperforming LLM1. However, deployment mode results are confounded by incremental coverage accumulation (as the paper itself acknowledges in Section 4), and no controlled experiments isolate these factors." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title scopes the work to Meta ('Automated Unit Test Improvement using Large Language Models at Meta'), but the abstract makes the broader claim of being 'the first report on industrial scale deployment of LLM-generated code.' The results are specific to Kotlin test classes on Instagram and Facebook with two internal Meta LLMs, yet the paper frames contributions in terms of 'Assured LLMSE' as a general paradigm without adequately bounding generalizability to other languages, systems, or LLMs." 
132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": true, 136 "justification": "The paper acknowledges confounding factors in deployment results (Section 4): incremental coverage accumulation making later trials harder, the test-a-thon context potentially inflating acceptance, and different sample sizes for different configurations making comparisons unreliable. The paper explicitly warns against over-interpreting temperature=0.4 results due to confounds." 137 } 138 }, 139 "setup_transparency": { 140 "model_versions_specified": { 141 "applies": true, 142 "answer": false, 143 "justification": "The LLMs are referred to only as 'LLM1' and 'LLM2' with no version, size, or architecture details. The paper states: 'Because details are commercially sensitive (and not relevant to this paper), we do not give details of the two LLMs' (footnote 1)." 144 }, 145 "prompts_provided": { 146 "applies": true, 147 "answer": true, 148 "justification": "Table 2 provides the four prompt templates with their full text. These include placeholders ({existing_test_class}, {class_under_test}) which are filled with actual code — the fill values are the test classes and classes under test, which are deterministic inputs rather than hand-crafted content." 149 }, 150 "hyperparameters_reported": { 151 "applies": true, 152 "answer": true, 153 "justification": "Temperature settings are reported and explored (Table 4, temperatures 0.0 to 0.9). The default temperature of 0.0 is stated. Section 3.3 describes the hyperparameter sweep." 154 }, 155 "scaffolding_described": { 156 "applies": true, 157 "answer": true, 158 "justification": "The three-stage filtration pipeline (builds → passes → improves coverage) is described in detail in Section 2, including the flakiness filter (5 repeated executions), coverage measurement, and the pre/post-processing steps. Figure 1 depicts the architecture." 
159 }, 160 "data_preprocessing_documented": { 161 "applies": true, 162 "answer": true, 163 "justification": "The paper describes how test classes were selected: Section 3.2 explains classes targeted were those subject to recent refactoring, Section 3.3 identifies 86 Kotlin components (31 Stories, 55 Reels). The filtration pipeline stages are documented with counts (Figure 2 Sankey diagram)." 164 } 165 }, 166 "limitations_and_scope": { 167 "limitations_section_present": { 168 "applies": true, 169 "answer": false, 170 "justification": "There is no dedicated limitations or threats-to-validity section. The paper has a 'Future Work and Open Problems' section (Section 7) but this discusses research directions, not study limitations." 171 }, 172 "threats_to_validity_specific": { 173 "applies": true, 174 "answer": false, 175 "justification": "No threats to validity are systematically discussed. While the paper mentions specific confounds in Section 4 (incremental coverage making comparisons unfair), these are scattered observations rather than a structured discussion of validity threats." 176 }, 177 "scope_boundaries_stated": { 178 "applies": true, 179 "answer": false, 180 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to Kotlin, to Meta's internal LLMs, or to the specific test-a-thon context. The general framing as 'Assured LLMSE' suggests broad applicability without stating limitations." 181 } 182 }, 183 "data_integrity": { 184 "raw_data_available": { 185 "applies": true, 186 "answer": false, 187 "justification": "No raw data is available. All results come from internal Meta deployments and no supplementary data is provided." 
188 }, 189 "data_collection_described": { 190 "applies": true, 191 "answer": true, 192 "justification": "The data collection is described: telemetry logging of every execution (Section 1), specific date ranges (Oct 29 - Dec 29, 2023), and which products/platforms were targeted (Instagram Reels, Stories, Facebook)." 193 }, 194 "recruitment_methods_described": { 195 "applies": false, 196 "answer": false, 197 "justification": "No human participants were recruited for a study. The engineers were doing their normal jobs at test-a-thons, not participating in a research study." 198 }, 199 "data_pipeline_documented": { 200 "applies": true, 201 "answer": true, 202 "justification": "The filtration pipeline from raw LLM generation to final accepted tests is well-documented: generates → builds filter → passes filter → coverage filter → human review. Figure 2 provides a Sankey diagram with percentages at each stage." 203 } 204 }, 205 "conflicts_of_interest": { 206 "funding_disclosed": { 207 "applies": true, 208 "answer": false, 209 "justification": "No funding source is explicitly disclosed. All authors are from Meta, which implicitly funded the work, but no formal funding acknowledgment is present." 210 }, 211 "affiliations_disclosed": { 212 "applies": true, 213 "answer": true, 214 "justification": "All authors are listed as Meta Platforms Inc. employees. The paper title includes 'at Meta' making the affiliation prominent." 215 }, 216 "funder_independent_of_outcome": { 217 "applies": true, 218 "answer": false, 219 "justification": "Meta employees are evaluating a Meta-internal tool (TestGen-LLM) deployed on Meta products. Meta has a direct interest in the tool's success. The funder (Meta) is not independent of the outcome." 220 }, 221 "financial_interests_declared": { 222 "applies": true, 223 "answer": false, 224 "justification": "No competing interests or financial interests statement is present in the paper." 
225 } 226 }, 227 "contamination": { 228 "training_cutoff_stated": { 229 "applies": true, 230 "answer": false, 231 "justification": "No training data cutoff date is stated for either LLM1 or LLM2. The LLMs are internal Meta models and no details about their training data are provided." 232 }, 233 "train_test_overlap_discussed": { 234 "applies": true, 235 "answer": false, 236 "justification": "No discussion of whether the test classes used in evaluation were part of the LLMs' training data. Since these are internal Meta LLMs likely trained on internal Meta code, overlap is plausible but not addressed." 237 }, 238 "benchmark_contamination_addressed": { 239 "applies": true, 240 "answer": false, 241 "justification": "The evaluation is on internal Meta code with internal Meta LLMs. The possibility that these LLMs were trained on the very test classes they are asked to extend is not discussed." 242 } 243 }, 244 "human_studies": { 245 "pre_registered": { 246 "applies": false, 247 "answer": false, 248 "justification": "This is not a human subjects study. Engineers participated in test-a-thons as part of their normal work, not as research participants." 249 }, 250 "irb_or_ethics_approval": { 251 "applies": false, 252 "answer": false, 253 "justification": "Not a human subjects study. Engineers were doing their regular work evaluating code diffs." 254 }, 255 "demographics_reported": { 256 "applies": false, 257 "answer": false, 258 "justification": "Not a human subjects study. The 36 engineers in the test-a-thon were performing their normal work duties." 259 }, 260 "inclusion_exclusion_criteria": { 261 "applies": false, 262 "answer": false, 263 "justification": "Not a human subjects study. Engineers were part of existing test-a-thon teams." 264 }, 265 "randomization_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "Not a human subjects study. No experimental conditions were assigned to participants." 
269 }, 270 "blinding_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "Not a human subjects study. Engineers were informed they were receiving LLM-generated diffs." 274 }, 275 "attrition_reported": { 276 "applies": false, 277 "answer": false, 278 "justification": "Not a human subjects study." 279 } 280 }, 281 "cost_and_practicality": { 282 "inference_cost_reported": { 283 "applies": true, 284 "answer": false, 285 "justification": "No inference costs, API costs, or latency figures are reported. The number of trials is given (e.g., 32,531 total) but not the cost per trial or total compute consumed." 286 }, 287 "compute_budget_stated": { 288 "applies": true, 289 "answer": false, 290 "justification": "No computational budget, GPU hours, or hardware specifications are stated. The paper runs tens of thousands of LLM inference trials plus repeated test executions without quantifying the compute cost." 291 } 292 } 293 }, 294 "claims": [ 295 { 296 "claim": "75% of TestGen-LLM's test cases built correctly, 57% passed reliably, and 25% increased coverage on Instagram Reels and Stories.", 297 "evidence": "Section 3.3 reports results over 86 Kotlin components (31 Stories, 55 Reels). Figure 2 Sankey diagram depicts the filtration funnel.", 298 "supported": "strong" 299 }, 300 { 301 "claim": "TestGen-LLM improved 10% (abstract says 11.5%) of all test classes to which it was applied across the three test-a-thons.", 302 "evidence": "Section 4 states '196 test classes were successfully improved, while the TestGen-LLM tool was applied to a total of 1,979 test classes' yielding approximately 10%.", 303 "supported": "strong" 304 }, 305 { 306 "claim": "73% of TestGen-LLM's test improvements were accepted by developers for production deployment.", 307 "evidence": "Section 4 states this figure. 
Breakdown across test-a-thons: Instagram Dec had 36/42 accepted; Facebook Dec had 144/280 accepted.", 308 "supported": "moderate" 309 }, 310 { 311 "claim": "This is the first report on industrial scale deployment of LLM-generated code with assured improvement guarantees.", 312 "evidence": "Stated in the abstract and conclusions. No counter-evidence is presented, but the claim of 'first' is inherently hard to verify.", 313 "supported": "weak" 314 }, 315 { 316 "claim": "Temperature zero produces the best default results for test generation.", 317 "evidence": "Section 3.3 and Table 4. However, the paper itself cautions that sample sizes are highly unequal (30,483 trials at temp 0 vs. 334 at temp 0.4) and confounding factors exist.", 318 "supported": "weak" 319 }, 320 { 321 "claim": "Each LLM and prompt strategy makes a unique contribution in the ensemble approach.", 322 "evidence": "Section 3.3 shows that 3 of 4 prompts for LLM1 and all 4 for LLM2 contributed unique test cases not found by other combinations.", 323 "supported": "moderate" 324 } 325 ], 326 "methodology_tags": [ 327 "case-study", 328 "benchmark-eval" 329 ], 330 "key_findings": "Meta's TestGen-LLM tool uses an ensemble of LLMs with a three-stage filtration pipeline (builds, passes, improves coverage) to extend existing Kotlin unit test classes. On 86 Instagram components, 75% of generated tests built, 57% passed reliably, and 25% increased coverage. Deployed across three test-a-thons on Instagram and Facebook, it improved 10% of 1,979 targeted test classes, with 73% of recommendations accepted by engineers. 
The paper demonstrates that combining multiple LLMs and prompt strategies in an ensemble with automated quality filters can produce production-ready test improvements at industrial scale.", 331 "red_flags": [ 332 { 333 "flag": "Company evaluating its own product", 334 "detail": "All 9 authors are Meta employees evaluating TestGen-LLM, an internal Meta tool, deployed on internal Meta codebases using internal Meta LLMs. There is a strong institutional interest in positive results. No external evaluation or independent replication is present." 335 }, 336 { 337 "flag": "Opaque LLMs prevent independent assessment", 338 "detail": "The two LLMs used ('LLM1' and 'LLM2') are internal Meta models with no disclosed architecture, size, training data, or version information. This makes the results impossible to reproduce or verify, and makes it impossible to assess data contamination." 339 }, 340 { 341 "flag": "No uncertainty quantification", 342 "detail": "All results are reported as point estimates without confidence intervals, error bars, significance tests, or variance measures. With 32,531 total trials, statistical analysis would be straightforward but is absent." 343 }, 344 { 345 "flag": "Confounded deployment results acknowledged but not mitigated", 346 "detail": "The paper acknowledges that deployment-mode results are confounded by incremental coverage accumulation (Section 4), making comparisons between temperatures, LLMs, and prompts unreliable. Despite this acknowledgment, the confounded numbers are still presented as results." 347 }, 348 { 349 "flag": "No limitations or threats-to-validity section", 350 "detail": "For a top-venue empirical paper, the absence of a dedicated limitations section is notable. Scope boundaries are not stated, and generalization claims (Assured LLMSE as a paradigm) extend well beyond the tested setting." 
351 }, 352 { 353 "flag": "Acceptance rate may be inflated by test-a-thon context", 354 "detail": "Engineers were in test-a-thon mode (focused on writing tests) and in some cases pre-warned about LLM-generated diffs. The 73% acceptance rate may not reflect acceptance in normal engineering workflows. The paper partially acknowledges this for the Instagram test-a-thon but not systematically." 355 } 356 ], 357 "cited_papers": [ 358 { 359 "title": "Large Language Models for Software Engineering: Survey and Open Problems", 360 "authors": ["Angela Fan", "Beliz Gokkaya", "Mitya Lyubarskiy", "Mark Harman", "Shubho Sengupta", "Shin Yoo", "Jie Zhang"], 361 "year": 2023, 362 "relevance": "Comprehensive survey of LLM applications in software engineering, covering test generation and other tasks relevant to the survey scope." 363 }, 364 { 365 "title": "Software Testing with Large Language Model: Survey, Landscape, and Vision", 366 "authors": ["Junjie Wang", "Yuchao Huang", "Chunyang Chen", "Zhe Liu", "Song Wang", "Qing Wang"], 367 "year": 2023, 368 "arxiv_id": "2307.07221", 369 "relevance": "Literature review of 102 papers on LLM-based testing, debugging, and repair — directly relevant to understanding the landscape of AI-assisted testing research." 370 }, 371 { 372 "title": "Exploring the Effectiveness of Large Language Models in Generating Unit Tests", 373 "authors": ["Mohammed Latif Siddiq", "Joanna C. S. Santos", "Ridwanul Hasan Tanvir", "Noshin Ulfat", "Fahmid Al Rifat", "Vinicius Carvalho Lopes"], 374 "year": 2023, 375 "arxiv_id": "2305.00418", 376 "relevance": "Evaluates LLM unit test generation effectiveness with Codex on HumanEval and EvoSuite benchmarks — a baseline comparison point for test generation quality." 377 }, 378 { 379 "title": "No More Manual Tests? 
Evaluating and Improving ChatGPT for Unit Test Generation", 380 "authors": ["Zhiqiang Yuan", "Yiling Lou", "Mingwei Liu", "Shiji Ding", "Kaixin Wang", "Yixuan Chen", "Xin Peng"], 381 "year": 2023, 382 "arxiv_id": "2305.04207", 383 "relevance": "Evaluates ChatGPT's unit test generation with prompt engineering, reporting roughly one-third executable tests — a key comparison point." 384 }, 385 { 386 "title": "Adaptive Test Generation Using a Large Language Model", 387 "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"], 388 "year": 2023, 389 "arxiv_id": "2302.06527", 390 "relevance": "Reports 70% statement-level coverage for LLM-generated tests on smaller systems, relevant for comparing LLM test generation effectiveness." 391 }, 392 { 393 "title": "CODAMOSA: Escaping Coverage Plateaus in Test Generation with Pre-trained Large Language Models", 394 "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K Lahiri", "Siddhartha Sen"], 395 "year": 2023, 396 "relevance": "Hybrid approach combining SBST with LLMs for test generation — directly relevant to understanding ensemble/hybrid testing methodologies." 397 }, 398 { 399 "title": "SapFix: Automated End-to-End Repair at Scale", 400 "authors": ["Alexandru Marginean", "Johannes Bader", "Satish Chandra", "Mark Harman", "Yue Jia", "Ke Mao", "Alexander Mols", "Andrew Scott"], 401 "year": 2019, 402 "relevance": "Prior Meta deployment of automated program repair with 50% acceptance rate, providing a direct comparison point for industrial deployment of AI-assisted code generation." 403 }, 404 { 405 "title": "Evaluating Large Language Models Trained on Code", 406 "authors": ["Mark Chen et al."], 407 "year": 2021, 408 "arxiv_id": "2107.03374", 409 "relevance": "Introduces Codex and the HumanEval benchmark, foundational to LLM code generation evaluation." 
410 }, 411 { 412 "title": "Assured LLM-Based Software Engineering", 413 "authors": ["Nadia Alshahwan", "Mark Harman", "Alexandru Marginean", "Shubho Sengupta", "Eddy Wang"], 414 "year": 2024, 415 "relevance": "Companion paper defining the Assured LLMSE paradigm that TestGen-LLM instantiates — central to the methodological framework being evaluated." 416 }, 417 { 418 "title": "Code Llama: Open Foundation Models for Code", 419 "authors": ["Baptiste Rozière et al."], 420 "year": 2023, 421 "arxiv_id": "2308.12950", 422 "relevance": "Meta's open-source code LLM, relevant to understanding the landscape of code-generation models evaluated in the survey." 423 }, 424 { 425 "title": "Effective Test Generation Using Pre-trained Large Language Models and Mutation Testing", 426 "authors": ["Arghavan Moradi Dakhel", "Amin Nikanjam", "Vahid Majdinasab", "Foutse Khomh", "Michel C Desmarais"], 427 "year": 2023, 428 "relevance": "Hybridizes LLMs with mutation testing for test generation — relevant to understanding complementary approaches to improving LLM-generated tests." 429 }, 430 { 431 "title": "Learning Deep Semantics for Test Completion", 432 "authors": ["Pengyu Nie", "Rahul Banerjee", "Junyi Jessy Li", "Raymond J. Mooney", "Milos Gligoric"], 433 "year": 2023, 434 "arxiv_id": "2302.10166", 435 "relevance": "Reports 29% executable tests with TeCo, providing a baseline for LLM test generation quality comparison." 436 } 437 ] 438 }