scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (34954B)
      1 {
      2   "paper": {
      3     "title": "Large-scale, Independent and Comprehensive study of the power of LLMs for test case generation",
      4     "authors": [
      5       "Wendkûuni C. Ouédraogo",
      6       "Abdoul Kader Kaboré",
      7       "Yinghua Li",
      8       "Haoye Tian",
      9       "Anil Koyuncu",
     10       "Jacques Klein",
     11       "David Lo",
     12       "Tegawendé F. Bissyandé"
     13     ],
     14     "year": 2024,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2407.00225",
     17     "doi": "10.48550/arXiv.2407.00225"
     18   },
     19   "scan_version": 2,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "LLM-generated unit tests achieve higher readability than EvoSuite (21-40% improvement) but dramatically underperform in compilability (only 7.2% compile) and fault detection (0% mutation score). Reasoning-based prompting (GToT, CoT) consistently improves test structure, extractability, and compilability over zero-shot and few-shot approaches, challenging prior claims that prompting has limited impact. Hallucination-driven compilation failures—particularly 'Cannot Find Symbol' errors at up to 86%—remain the dominant obstacle to practical LLM-based test generation.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Replication package URL provided: https://anonymous.4open.science/r/LLM4TS-0F76/ containing 'prompt templates, evaluation scripts, and results' (Section 1, contribution ❻). Note: this is an anonymous review link which may not persist."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper provides 'a benchmark of 216,300 tests across 690 Java classes' in the replication package. Defects4J and SF110 are existing public benchmarks. The Data Availability Statement confirms datasets and code are in the repository."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Hardware is mentioned for EvoSuite (AMD EPYC 7552, 640GB RAM) in Section 3.5, and individual tools are named (Javalang, JaCoCo, TsDetect, Pitest, tiktoken). However, no requirements.txt, library versions, or environment specification sufficient to recreate the setup is provided in the paper."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are provided in the paper. The replication package URL is given but the paper itself does not describe how to run the experiments, in what order, or with what commands."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Main results (MSR, CSR, compilability in Tables 3-7) are reported as point estimate percentages without confidence intervals or error bars. Coverage tables (14-16) show distribution statistics (Q1/Q3/median) but these describe the data distribution, not uncertainty in the estimates."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper makes many comparative claims (e.g., 'GToT consistently outperforms,' 'GPT-3.5-Turbo outperformed GPT-4') but provides no statistical significance tests (no p-values, no t-tests, no Mann-Whitney U, no bootstrap tests) anywhere in the paper."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Percentage improvements with baseline context are reported throughout: e.g., 'LLM-generated test suites improve readability over EvoSuite by 21–40%' (Finding 14), specific MSR/CSR values per technique (Tables 4-5), compilability rates per technique (Table 7). Baseline values are always provided alongside improvements."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No power analysis or statistical justification for sample sizes. The 690 classes across 3 datasets are pragmatically chosen. CMD is only 31 classes from 2 projects, and the paper acknowledges this is small (Section 5) but does not justify whether it is sufficient for the claims made."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Coverage results (Tables 14-16) report Min, Q1, Median, Q3, Max, and Mean across test suites. EvoSuite was run 30 times per class, and LLM prompts were executed 30 times per test suite (Section 5). The interquartile range provides spread measures."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "EvoSuite (with DynaMOSA algorithm) serves as the primary baseline throughout the study. Additionally, different prompting techniques serve as baselines for each other."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "EvoSuite is the standard SBST tool for Java unit test generation, and the paper cites multiple recent studies using it as a baseline (Tang et al. 2024, Shamshiri et al. 2015). The LLMs evaluated include GPT-4 (March 2023) and Mixtral 8x7B (December 2023), which were contemporary at the time of the study."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The five prompting techniques (ZSL, FSL, CoT, ToT, GToT) form a systematic ablation where GToT builds on CoT + ToT, allowing assessment of each component's contribution. Results are reported per-technique across all metrics and datasets."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The paper evaluates MSR, CSR, syntactic correctness, compilability, readability (Scalabrino model), cyclomatic complexity, cognitive complexity, code coverage (line, instruction, method), execution outcomes (pass/fail/timeout), mutation testing, test smell detection, and SpotBugs analysis—over a dozen distinct metrics across 7 research questions."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No human evaluation of LLM-generated test quality is performed. The paper uses automated readability models (Scalabrino et al. 2018) correlated with human assessments, but no humans directly evaluated the generated tests. Section 5 acknowledges manual validation of only 'a subset of test cases' for test smells, which is verification of detection tools, not evaluation of test quality."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The LLMs are pre-trained models evaluated without fine-tuning on any of the benchmarks. CMD was specifically created from classes committed after May 2023 to serve as an unseen dataset. No selection decisions were made based on the evaluation data."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results are extensively broken down by model (4 LLMs), prompting technique (5), dataset (3), and metric type. Tables 3-19 provide per-condition results. Error types are broken down by category in Tables 8, 18 and Figures 5, 7."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Extensive failure analysis: compilation error categorization with 22 error types (Table 8), runtime failure classification (Table 18), hallucination pattern analysis (Figures 5, 7), and discussion of why mutation testing yielded 0% (Section 4.6). Finding 22 identifies three key error patterns."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Multiple negative results reported: 0% mutation score (Finding 23), only 7.2% compilation rate (Finding 7), FSL underperforming ZSL in multiple metrics (Finding 2), GPT-3.5 outperforming GPT-4 in some cases, and 100% failure rate for Mistral 7B and Mixtral 8x7B on CMD."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The abstract claims GToT 'significantly enhances test reliability, compilability, and structural adherence' — supported by Tables 4-7. Claims of 'high compilation failure rates (up to 86%)' — supported by Table 7 (compilability as low as 0%). Claims about hallucination patterns — supported by Tables 8, 18."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper makes causal claims about prompting techniques (e.g., 'GToT consistently enhances test quality'). The experimental design controls for this by varying only the prompting technique while holding the model and dataset constant, which is adequate for causal claims about prompt effects on output quality."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title claims 'Large-scale, Independent and Comprehensive study of the power of LLMs for test case generation' but the study covers only 4 LLMs, only Java/JUnit, and GPT-4 was tested on only 31 classes. The abstract and title do not bound claims to Java or the specific models tested. Section 5 acknowledges some limitations but the title overclaims."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Section 5 discusses multiple alternative explanations: data leakage for Defects4J performance ('performance may reflect memorization rather than genuine reasoning,' Finding 19), LLM output randomness, model knowledge cutoff effects, and CMD's small size possibly explaining unexpected GPT-3.5 > GPT-4 results."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper explicitly distinguishes between proxy and outcome throughout. Section 4.6 states 'coverage indicates broad execution' but 'doesn't guarantee usefulness or fault detection.' Finding 21 notes 'higher pass rates don't necessarily demonstrate superior fault detection capability.' The authors also separate readability from maintainability as distinct constructs (Section 2.4)."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Table 1 lists 'GPT 3.5-turbo,' 'GPT 4,' 'Mistral 7B,' and 'Mixtral 8x7B' with release dates but no specific API version strings or snapshot dates (e.g., 'gpt-3.5-turbo-0613'). The study period is mentioned as 'November 2023–April 2024' in Section 5 but exact model versions are not specified."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Actual prompt text is provided for CoT (Figure 3a), ToT (Figure 3b), and GToT (Figure 4) with placeholders {class_name} and {source_code} that are deterministic from the datasets. Section 3.4 describes the NLD components. 'All prompts are available in our replication package.' The fill values (class source code) come from the public datasets."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.3: 'all models are set to a standard temperature of 0.7.' Token limit: 4,096 tokens. Token counting via tiktoken. EvoSuite: 3-minute time budget, DynaMOSA algorithm, 30 runs per class. Pitest timeout: 4 seconds."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. The study directly prompts LLMs with class source code and collects outputs. No tool use, retry logic, or feedback mechanisms."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 3 describes: token counting and filtering with tiktoken (excluding inputs > 4,096 tokens), code extraction via regex with delimiters, syntax checking via Javalang, compilation via JVM, and test execution classification. Figure 2 provides a pipeline overview. CMD curation criteria are described in Section 3.6."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 5 'Threats to Validity' provides substantive discussion organized into External Validity and Internal Validity subsections, covering generalizability, data contamination, model variability, and methodology limitations."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Specific threats discussed: CMD's small size of 2 projects (Section 5), GPT-3.5 outperforming GPT-4 as a dataset-dependent anomaly, Mistral 7B's older knowledge base (2021-2022), Pitest analysis limited to 'Passed' tests only, test smell detection tools' high false-positive rates (citing Panichella et al. 2022), and budget constraints limiting GPT-4 evaluation."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 5 explicitly states: 'Our evaluation of four models from OpenAI and MistralAI further limits the breadth of our conclusions. Future work should explore a wider range of models.' Also: 'CMD's small size of just two projects restricts our observations.' And: 'Our findings reflect these models' capabilities during our study period (November 2023–April 2024), and newer versions may demonstrate different results.'"
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The Data Availability Statement says 'The datasets and code used in the present study are available in our repository' at the anonymous URL. The replication package contains 216,300 generated test files and evaluation results."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 3.6 describes dataset selection: SF110 derived from DynaMOSA benchmark (346 classes from 117 projects), Defects4J (835 bugs across 17 projects, 477 classes used), CMD (2 projects: ODC and Conductor OSS, 31 classes with buggy-fixed pairs, selected for token constraints and post-May 2023 commits). Table 2 provides complexity breakdown."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. Data sources are standard benchmarks (Defects4J, SF110) and curated open-source projects (CMD) with documented selection criteria."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Figure 2 provides a visual pipeline overview with 11 numbered steps from prompt engineering through evaluation. Each step (code extraction, MSR/CSR evaluation, syntax checking, compilation, test execution, coverage measurement, mutation testing, test smell detection) is described in the corresponding RQ sections."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Funding section lists: Luxembourg National Research Fund (FNR), grant reference AFR PhD bilateral, project reference 17185670, and European Research Council (ERC) under Horizon 2020, grant agreement No. 949014."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "All eight authors' affiliations are listed: University of Luxembourg (SnT Centre), Singapore Management University, The University of Melbourne, and Bilkent University. None are affiliated with OpenAI or MistralAI (the companies whose models are evaluated)."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "FNR and ERC are public research funding agencies with no financial interest in whether specific LLMs or EvoSuite perform better. The funding is for general research, not product evaluation."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "Declarations section explicitly states: 'The authors declare that they have no conflict of interest.'"
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "Table 1 lists release dates (not training cutoffs) for each model. Section 5 mentions 'Mistral 7B's older knowledge base (2021-2022)' but does not provide explicit training data cutoff dates for any model, particularly GPT-3.5 and GPT-4."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Section 4.5 (Finding 19) explicitly discusses overlap: 'Defects4J's broad accessibility raises data leakage concerns, as LLMs may have encountered its test cases during pretraining.' Cites Sallou et al. (2023) showing 'ChatGPT can retrieve detailed information about specific Defects4J bugs, suggesting performance may reflect memorization rather than genuine reasoning.'"
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": true,
    249         "justification": "CMD was specifically created to address contamination: 'including only classes committed after May 2023 to reflect modern Java practices and prevent potential data leakage' (Section 3.6). The paper explicitly compares performance across contamination-prone (Defects4J) and contamination-mitigated (CMD) datasets."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study. It is a benchmark evaluation of LLM-generated test suites."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants. The Ethical Approval section states: 'This article does not contain any studies with human participants or animals.'"
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No API costs, tokens consumed, or per-example inference costs are reported. The paper only mentions 'budgetary reasons' for limiting GPT-4 and Mixtral evaluation to CMD (Table 3 note) without quantifying actual costs."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "Hardware is specified for EvoSuite (AMD EPYC 7552, 640GB RAM, Section 3.5) but no total compute budget (GPU hours, total API spend, wall-clock time for the full experiment) is stated. The total cost of 216,300 LLM API calls is not quantified."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "LLM experiments were run 30 times per test suite and EvoSuite 30 times per class to address output randomness (Section 5). Coverage results (Tables 14-16) report distribution statistics (Min, Q1, Median, Q3, Max, Mean) across these runs."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Section 3.5: 'we run EvoSuite 30 times per class, following (Tang et al., 2024).' Section 5: 'we mitigated by executing each prompt 30 times per test suite and performing statistical analyses.'"
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Temperature is set to 0.7 for all models as a 'standard' value (Section 3.3) with no justification for this choice and no exploration of alternatives. No hyperparameter search was conducted or reported."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": true,
    320         "justification": "All 5 prompting techniques × 4 models × 3 datasets are reported comprehensively in Tables 3-19. No selective reporting of best configurations — all conditions are shown."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The paper makes numerous comparisons (4 models × 5 prompts × 3 datasets × multiple metrics) with no statistical tests at all, let alone corrections for multiple comparisons."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "GToT is the authors' novel prompting technique and consistently performs best in their evaluation. The authors do not acknowledge the bias of evaluating their own method against alternatives, nor do they provide an independent evaluation."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "EvoSuite gets a 3-minute time budget per class (Section 3.5), but LLM inference time and cost are not reported. The paper does not compare performance at matched compute budgets. GPT-4 API costs are mentioned as limiting but not quantified."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": true,
    340         "justification": "The paper extensively questions whether its metrics measure what matters: Section 4.6 argues coverage doesn't imply fault detection, Finding 21 notes 'higher pass rates don't necessarily demonstrate superior fault detection capability,' and mutation testing (Section 4.6) is used specifically to test whether passing tests actually verify program behavior."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "No scaffolding is involved. LLMs are prompted directly with source code; no agentic framework, tool use, or multi-step workflow is used."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": true,
    352         "justification": "CMD was created specifically to address temporal leakage: 'including only classes committed after May 2023 to reflect modern Java practices and prevent potential data leakage' (Section 3.6). Section 4.5 discusses how Defects4J's public availability creates temporal leakage risk."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "The paper does not discuss whether the evaluation setup (providing full class source code) leaks information that would not be available in realistic test generation scenarios, or whether the prompting structure itself provides hints beyond real usage."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "The paper does not discuss whether examples across its three datasets share structural similarities, whether the same projects or code patterns appear in training data and test data, or whether Defects4J and SF110 projects overlap."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": true,
    367         "justification": "CMD serves as a temporal-split leakage-prevention method (classes committed after May 2023). The paper also cites Sallou et al. (2023)'s analysis demonstrating ChatGPT can retrieve Defects4J-specific information, providing empirical evidence of contamination in older benchmarks."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "GToT consistently enhances test quality—improving structure, reducing hallucinations, and increasing extractability and compilability across models and datasets.",
    374       "evidence": "Tables 4-7 show GToT achieves highest MSR, CSR, and compilability rates across most model-dataset combinations. E.g., GPT-3.5-turbo with GToT reaches 9.67% compilability on SF110 vs 5.94% for ZSL (Table 7). GToT also achieves lowest indentation violations (Figure 6a).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Only 7.2% of all LLM-generated test suites and 12% of syntactically correct ones successfully compile, with class-level dependencies being the primary challenge.",
    379       "evidence": "Table 7 shows compilability rates ranging from 0% to 20.16% across all model-prompt-dataset conditions. Finding 7 aggregates to 7.2% overall and 12% of syntactically correct suites. Prior method-level studies reported higher rates.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "LLM-generated tests outperform EvoSuite in readability by 21-40%.",
    384       "evidence": "Table 12: GPT-3.5-Turbo GToT mean readability 84.75% vs EvoSuite 50.77%. Mistral 7B GToT 76.35% vs EvoSuite 50.77%. Consistent across all models and prompting techniques in Tables 11-13.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "EvoSuite consistently outperforms all LLMs in code coverage metrics (line, instruction, method coverage).",
    389       "evidence": "Tables 14-16: EvoSuite median 94.34% line coverage vs GPT-3.5-Turbo best median 75.47% (CoT). EvoSuite 100% median method coverage vs GPT-3.5-Turbo best 100% (CoT) but with much lower Q1. EvoSuite mean 91.75% line coverage vs best LLM mean 55.56%.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "LLM-generated passed tests yielded 0% mutation score, as they primarily targeted interfaces, abstract classes, or trivial methods.",
    394       "evidence": "Section 4.6 describes mutation testing with Pitest finding 0% line coverage and 0% mutation coverage for passed LLM-generated tests. Finding 23 explains these tests 'failed to execute meaningful code, making them ineffective for fault detection.'",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Hallucination-driven 'Cannot Find Symbol' errors dominate compilation failures, reaching up to 86% of all compilation errors.",
    399       "evidence": "Table 8: CFS errors at 86.48% (GPT-3.5 CMD), 84.21% (GPT-3.5 Defects4J), 82.0% (Mistral 7B SF110), 76.99% (GPT-4 CMD). 'Package Does Not Exist' errors are also significant for Mistral models (up to 23.13%).",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "Reasoning-based prompting challenges prior claims that in-context learning is ineffective for test generation in code-specialized LLMs.",
    404       "evidence": "GToT and CoT improvements over ZSL/FSL across Tables 4-7, 12-16. The paper explicitly challenges Yang et al. (2024) who found limited prompting impact on code-specialized models. However, the effect sizes are modest in some metrics.",
    405       "supported": "moderate"
    406     },
    407     {
    408       "claim": "Defects4J performance likely reflects memorization rather than genuine reasoning due to training data contamination.",
    409       "evidence": "Section 4.5 Finding 19 cites Sallou et al. (2023) showing ChatGPT retrieves Defects4J-specific information. LLMs achieve much higher coverage on Defects4J (82.74% line median) than SF110 (3.57%) or CMD (varies). However, this is correlational—complexity differences between datasets could also explain the gap.",
    410       "supported": "moderate"
    411     }
    412   ],
    413   "red_flags": [
    414     {
    415       "flag": "No statistical significance tests",
    416       "detail": "Despite making dozens of comparative claims across 4 models × 5 prompts × 3 datasets × 7 RQs, the paper includes no statistical significance tests whatsoever. All claims of superiority are based on comparing raw percentages without testing whether differences are statistically meaningful."
    417     },
    418     {
    419       "flag": "Severely unbalanced model evaluation",
    420       "detail": "Due to budget constraints, GPT-4 and Mixtral 8x7B were tested only on CMD (31 classes from 2 projects), while GPT-3.5 and Mistral 7B were tested on all three datasets. This makes cross-model comparisons unreliable—GPT-4 conclusions are drawn from a tiny, non-representative subset."
    421     },
    422     {
    423       "flag": "Overclaiming in title",
    424       "detail": "The title claims 'Large-scale, Independent and Comprehensive' but GPT-4 was tested on only 31 classes, the study covers only Java/JUnit, and only 4 LLMs from 2 providers are evaluated. The 'large-scale' claim is driven by 216,300 generated files from 30 repetitions, not breadth of evaluation."
    425     },
    426     {
    427       "flag": "CMD is too small for generalization",
    428       "detail": "The CMD dataset (meant to address contamination) contains only 31 classes from 2 projects (ODC and Conductor OSS). Several model-prompt combinations on CMD have very few or zero compilable tests, making percentage-based findings unreliable. E.g., Mistral 7B had 0 compilable tests for ZSL and CoT on CMD."
    429     },
    430     {
    431       "flag": "Model versions not pinned",
    432       "detail": "The study period spans November 2023–April 2024, during which OpenAI regularly updated GPT models behind the same API name. Without pinned model versions (e.g., gpt-3.5-turbo-0613), results may not be reproducible."
    433     },
    434     {
    435       "flag": "Self-comparison bias for GToT",
    436       "detail": "GToT is the authors' novel contribution and consistently appears as the best or among the best prompting strategies. The authors do not acknowledge the inherent bias of evaluating their own method, nor do they provide independent evaluation."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "Using large language models to generate junit tests: An empirical study",
    442       "authors": ["M. L. Siddiq", "J. C. Da Silva Santos", "R. H. Tanvir", "N. Ulfat", "F. Al Rifat", "V. Carvalho Lopes"],
    443       "year": 2024,
    444       "relevance": "Empirical evaluation of ChatGPT-generated JUnit tests focusing on compilability, correctness, and coverage — directly comparable to this study's scope."
    445     },
    446     {
    447       "title": "Chatgpt vs sbst: A comparative assessment of unit test suite generation",
    448       "authors": ["Y. Tang", "Z. Liu", "Z. Zhou", "X. Luo"],
    449       "year": 2024,
    450       "relevance": "Comparative evaluation of ChatGPT and EvoSuite for test generation; foundational baseline study for this work."
    451     },
    452     {
    453       "title": "On the evaluation of large language models in unit test generation",
    454       "authors": ["L. Yang", "C. Yang", "S. Gao", "W. Wang", "B. Wang", "Q. Zhu", "X. Chu", "J. Zhou", "G. Liang", "Q. Wang"],
    455       "year": 2024,
    456       "relevance": "Cross-LLM evaluation for test generation with prompting analysis; this paper directly challenges their claims about reasoning-based prompting."
    457     },
    458     {
    459       "title": "An empirical evaluation of using large language models for automated unit test generation",
    460       "authors": ["M. Schäfer", "S. Nadi", "A. Eghbali", "F. Tip"],
    461       "year": 2023,
    462       "relevance": "Early empirical evaluation of LLM-based unit test generation establishing foundational benchmarks."
    463     },
    464     {
    465       "title": "Evaluating and improving chatgpt for unit test generation",
    466       "authors": ["Z. Yuan", "M. Liu", "S. Ding", "K. Wang", "Y. Chen", "X. Peng", "Y. Lou"],
    467       "year": 2024,
    468       "relevance": "Evaluation of ChatGPT for test generation with iterative improvement methods; demonstrates prompt structure affects test quality."
    469     },
    470     {
    471       "title": "Breaking the silence: the threats of using llms in software engineering",
    472       "authors": ["J. Sallou", "T. Durieux", "A. Panichella"],
    473       "year": 2023,
    474       "arxiv_id": "2312.08055",
    475       "relevance": "Documents threats of LLM use in SE including training data contamination in benchmarks like Defects4J — key reference for this study's contamination discussion."
    476     },
    477     {
    478       "title": "Llm hallucinations in practical code generation: Phenomena, mechanism, and mitigation",
    479       "authors": ["Z. Zhang", "Y. Wang", "C. Wang", "J. Chen", "Z. Zheng"],
    480       "year": 2024,
    481       "arxiv_id": "2409.20550",
    482       "relevance": "Analysis of LLM hallucination patterns in code generation; this paper extends their findings to test generation contexts."
    483     },
    484     {
    485       "title": "Automated unit test improvement using large language models at meta",
    486       "authors": ["N. Alshahwan", "J. Chheda", "A. Finogenova", "B. Gokkaya", "M. Harman", "I. Harper", "A. Marginean", "S. Sengupta", "E. Wang"],
    487       "year": 2024,
    488       "relevance": "Industrial-scale LLM test improvement at Meta, demonstrating production deployment of LLM-based test generation."
    489     },
    490     {
    491       "title": "Language models are few-shot learners",
    492       "authors": ["T. Brown", "B. Mann", "N. Ryder"],
    493       "year": 2020,
    494       "relevance": "Foundational GPT-3 paper establishing few-shot prompting paradigm used in this study's FSL evaluation."
    495     },
    496     {
    497       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    498       "authors": ["J. Wei", "X. Wang", "D. Schuurmans", "M. Bosma", "F. Xia", "E. Chi", "Q. V. Le", "D. Zhou"],
    499       "year": 2022,
    500       "relevance": "Foundational CoT prompting paper; CoT is one of the five prompting techniques evaluated in this study."
    501     },
    502     {
    503       "title": "Unit test generation using generative ai: A comparative performance analysis of autogeneration tools",
    504       "authors": ["S. Bhatia", "T. Gandhi", "D. Kumar", "P. Jalote"],
    505       "year": 2024,
    506       "relevance": "Comparative analysis of AI-based test generation tools; found LLM-generated Python tests suffer from incorrect assertions."
    507     },
    508     {
    509       "title": "Software testing with large language models: Survey, landscape, and vision",
    510       "authors": ["J. Wang", "Y. Huang", "C. Chen", "Z. Liu", "S. Wang", "Q. Wang"],
    511       "year": 2024,
    512       "relevance": "Comprehensive survey of LLM-based software testing covering the landscape this empirical study contributes to."
    513     }
    514   ]
    515 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs