scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (33088B)
      1 {
      2   "paper": {
      3     "title": "Test Wars: A Comparative Study of SBST, Symbolic Execution, and LLM-Based Approaches to Unit Test Generation",
      4     "authors": [
      5       "Azat Abdullin",
      6       "Pouria Derakhshanfar",
      7       "Annibale Panichella"
      8     ],
      9     "year": 2025,
     10     "venue": "International Conference on Software Testing, Verification and Validation (ICST)",
     11     "arxiv_id": "2501.10200",
     12     "doi": "10.1109/ICST62969.2025.10989033"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "ChatGPT-4o is the best LLM for automated unit test generation among the four tested, but still falls behind EvoSuite (SBST) and Kex (symbolic execution) in line and branch coverage. LLM-based generation shows better median mutation scores, suggesting deeper semantic understanding, but fails to reproduce any real-world bugs from GitBug Java (0% fault reproduction vs 5.88% for EvoSuite and 7.35% for Kex). Feature-based analysis reveals all tools are hurt by cyclomatic complexity and internal dependencies, with LLM-based approaches especially sensitive to class size (SLOC).",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The pipeline source code is released at https://github.com/plan-research/tga-pipeline (footnote 1). A reproduction package is also available at Zenodo (https://doi.org/10.5281/zenodo.13862019), though the Zenodo package notes the source code was withheld during double-blind review."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "They use the publicly available GitBug Java dataset and provide a reproduction package at Zenodo with 'dataset collection scripts and the results presented in the evaluation' (Section IX)."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Specific tool versions are given: EvoSuite 1.2.0, Kex 0.0.8, TestSpark at a specific commit (linked). JDK 11 is specified. The pipeline is deployed as 'a swarm of Docker containers, cooperating via Docker Compose' (Section III)."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "The pipeline repository and Zenodo reproduction package together provide the basis for reproduction. The paper describes the full pipeline architecture (Section III, Figure 1) including benchmark format, runner interface, and analysis module."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Results from 10 runs per tool per CUT are presented as boxplots (Figures 3, 4) showing median, IQR, and outliers. This constitutes spread visualization on the main results."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Mann-Whitney U test is used for pairwise comparisons: 'we perform sound statistical analysis using the Mann-Whitney U test for the statistical significance' (Section IV-B). P-value threshold of 0.05 is used."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Vargha-Delaney Â12 effect size measure is reported: 'We further complement our analysis with the Vargha Delaney Â12 measure for effect size' with small/medium/large magnitude classifications (Section IV-B)."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The sample size of 136 bugs is determined by the available GitBug Java dataset after filtering for JDK 11 compatibility (199 → 136). No power analysis or explicit justification for statistical sufficiency is provided."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Results are reported across 10 runs per tool per CUT. Boxplots in Figures 3 and 4 show the full distribution including median, IQR, and outliers. Descriptive statistics include both median and mean."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Three tools are compared: EvoSuite (SBST), Kex (symbolic execution), and TestSpark with multiple LLMs. Each serves as a baseline for the others."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "EvoSuite 1.2.0 and Kex 0.0.8 are recent versions of state-of-the-art tools that have competed in SBFT competitions. ChatGPT-4o is a current-generation LLM. The benchmark (GitBug Java 2024) is also contemporary."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No ablation study is performed. RQ1 compares different LLMs within TestSpark, but this is model comparison rather than component ablation. The prompt design, feedback loop, and context reduction strategies are not individually ablated."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Five metrics are used: compilation rate, line coverage, branch coverage, mutation score, and failure reproduction rate (Section III-C)."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The authors 'manually analyzed the subset of 32 benchmarks that demonstrated significant differences in tool performances' (Section VI), providing qualitative expert evaluation of generated test outputs and tool behaviors."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "GitBug Java is used as the sole evaluation benchmark with no tuning on it. Default parameters are used for all tools. The dataset was specifically chosen because it was published after the LLMs' training data collection period."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Venn diagrams (Figure 5) show per-CUT winning tools. Correlation analyses break down performance by code features (Figure 6) and branch types (Figure 7). Manual analysis covers specific benchmark projects."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Extensive failure analysis: ChatGPT-4's 1229/1360 runs failing due to context size errors, TestSpark's 0% fault reproduction, compilation failures, and manual analysis of specific failing benchmarks (Section VI)."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Multiple negative results: TestSpark-ChatGPT-4o failed to reproduce any bugs (0%), smaller LLMs had <7% compilation rates, ChatGPT-4 had 1229/1360 context-size failures, and all tools performed poorly on fault reproduction overall."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The abstract claims the LLM-based approach 'significantly outperforms them in mutation scores,' but Section V-B shows EvoSuite has the best average mutation score (30.56%), while TestSpark-ChatGPT-4o has only the better median (6.32%). The statistical test results (Venn diagram, Figure 5c) show a more nuanced picture than 'significantly outperforms.'"
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Causal claims are appropriately hedged. The paper says mutation score results 'seem to suggest ChatGPT-4o might have a better understanding' and context size 'played a role.' The controlled comparison (same benchmark, same time budget, 10 runs) supports the comparative claims made."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper consistently bounds findings: 'among the models investigated in this study' (Section V-A), 'it is hard to generalize beyond our benchmark and dataset' (Section VI). Results are specific to Java, the GitBug Java dataset, and the three tools tested."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section VII discusses specific alternative explanations: time budget differences between LLM and traditional tools, dataset project imbalance (70 traccar, 29 jsoup), data leakage through pre-existing projects, and non-determinism effects."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Claims are made at the level of what is measured — coverage, mutation score, compilation rate, fault reproduction — without overclaiming general 'test quality.' They note mutation score 'characterizes bug-discovering abilities' and separately measure actual fault reproduction to cross-validate."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "LLMs are identified as 'ChatGPT-4,' 'ChatGPT-4o,' 'Llama Medium,' and 'Code Llama 70b' without snapshot dates or API versions. EvoSuite (1.2.0) and Kex (0.0.8) have versions, but the LLM versions are marketing names only."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Figure 2 shows the full default prompt template used by TestSpark. The placeholders ($CODE, $METHODS, $POLYMORPHISM) are mechanically derived from the CUT (source code, method signatures, polymorphic relations) as described in Section IV-A, allowing reconstruction."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "No LLM inference hyperparameters (temperature, top-p, max tokens) are reported for the GPT-4o API calls. Tool-level settings are partially described (DynaMOSA for EvoSuite, concolic mode for Kex), but sampling parameters for the LLMs are absent."
    157       },
    158       "scaffolding_described": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "TestSpark's scaffolding is described: three-step approach (prompt collection, LLM request, feedback loop), iterative prompt size reduction when exceeding context limits, headless IntelliJ IDEA execution for code inspection, and per-test extraction for compilation (Sections II-B, III)."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Filtering from 199 GitBug Java bugs to 136 (JDK 11 compatibility) is documented. CUT selection from bug patches via manual analysis is described. Test extraction into separate source files for granular compilation rate is explained (Section III-A, III-C)."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section VII 'Threats to Validity' is a dedicated section with three subsections: internal, external, and conclusion threats."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Specific threats include: time budget handling differences between LLM and traditional tools, GitBug Java's project distribution imbalance (70 traccar, 29 jsoup out of 136), data leakage through pre-existing projects, and implementation bugs in tools causing compilation failures."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The paper states 'it is hard to generalize beyond our benchmark and dataset' (Section VI), acknowledges the dataset 'may not be representative enough' (Section VII), and bounds findings to the specific tools and models tested."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "A Zenodo reproduction package (https://doi.org/10.5281/zenodo.13862019) contains 'dataset collection scripts and the results presented in the evaluation' (Section IX)."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section III-A describes using GitBug Java (199 bugs from 55 repositories), filtering for JDK 11 compatibility (yielding 136 bugs from 24 repositories), extracting CUTs from bug patches with manual analysis for ambiguous cases."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. The data source is the GitBug Java benchmark, a standard publicly available dataset."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The full pipeline is documented: benchmark → runner → tool execution → analysis (Figure 1). Filtering from 199 → 136 bugs is explained. Test extraction, compilation checking, coverage measurement (JaCoCo), and mutation testing (PIT) steps are described."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Section X acknowledges: 'This work was conducted as part of the AI for Software Engineering (AI4SE) collaboration between JetBrains and Delft University of Technology. The authors gratefully acknowledge the financial support provided by JetBrains.'"
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: JetBrains Research and TU Delft. First author Abdullin is the creator of Kex (reference [2]), and Derakhshanfar is a co-author of TestSpark (reference [48])."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "JetBrains funds the research and has a direct stake in two of the three tools being evaluated: TestSpark (a JetBrains Research product) and Kex (developed by the first author at JetBrains Research). JetBrains has a financial interest in these tools performing well."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement is present. The paper does not explicitly declare the conflict that two of three tools evaluated are products of the funding organization, despite the connection being inferrable from affiliations and references."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No specific training data cutoff dates are stated for any of the LLMs (ChatGPT-4, ChatGPT-4o, Llama Medium, Code Llama 70b). The paper argues GitBug Java is recent but does not state when each model's training data ends."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Extensively discussed as Limitation 1 in Section I and in Section VII: 'the projects used in GitBug Java were still available for LLMs for (pre)training.' The paper chose GitBug Java specifically to mitigate data leakage (bugs committed in 2023, dataset published 2024)."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Section I identifies benchmark contamination as a key limitation of prior work and selects GitBug Java because it 'has not been included in the training set for modern LLMs.' Section VII acknowledges residual risk since the underlying projects pre-date the LLMs."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in the study. This is a benchmark evaluation of automated test generation tools."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. The study evaluates software tools on a code benchmark."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in the study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in the study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in the study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in the study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in the study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "A 120-second time budget is set for EvoSuite and Kex, and TestSpark is noted to take 'about 2-3 minutes on each CUT.' However, no API costs, token consumption, or total dollar costs for the GPT-4o calls (10 runs × 136 CUTs) are reported."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No total computational budget is stated. The study runs 10 repetitions × 136 CUTs × 6 tool configurations but does not report total GPU hours, API spend, or hardware specifications."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Each tool is run 10 times with different seeds (and independent sessions for TestSpark). Results are shown as distributions via boxplots, explicitly addressing non-determinism across runs."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "'We repeated each execution 10 times as suggested by existing guidelines' (Section IV-A). The number is explicitly stated and justified by citation to Arcuri and Briand [4]."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "The paper explicitly states no hyperparameter search was performed: 'we use each tool's default (suggested) parameters as the default parameter values commonly used in the literature give reasonably acceptable results without incurring the additional computational cost required for parameter tuning' (Section IV-A), citing Arcuri & Fraser [5]."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "For RQ1, ChatGPT-4o is selected as the best LLM based on comprehensive pairwise statistical tests across all CUTs and all metrics, not cherry-picked from selective comparisons. Default configurations are used for all tools."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "1632 pairwise comparisons are performed for RQ1 alone (136 CUTs × 3 metrics × 4 model pairs), plus additional comparisons for RQ2. No Bonferroni, Holm, or other multiple comparison correction is mentioned despite the very large number of tests."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "Two of three tools evaluated (Kex, TestSpark) are products of the authors' organizations. First author Abdullin is the creator of Kex [2], and Derakhshanfar is a co-author of TestSpark [48]. No discussion of self-evaluation bias appears in the paper."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The paper acknowledges that 'the 120-second time limit is not always enough for the intensive approach of Kex' and that TestSpark takes 2-3 minutes, but does not report performance as a function of compute budget or equalize compute across tools."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper discusses why GitBug Java was chosen (recency, data leakage mitigation) but does not discuss whether its bug distribution, project composition, or task structure adequately represents the broader space of unit test generation scenarios."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The three tools have fundamentally different architectures (genetic algorithm, constraint solver, LLM + feedback loop), yet differences are attributed to the approach categories (SBST vs symbolic execution vs LLM-based) rather than the specific tool implementations. TestSpark's scaffolding (feedback loop, prompt reduction) is confounded with the LLM's capability."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "GitBug Java contains bugs from 2023 commits and was published in 2024, after the LLMs' training periods. The paper explicitly addresses this: 'This relatively new dataset has not been included in the training set for modern LLMs' (Section III-A)."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup leaks information. TestSpark receives CUT source code as input (the intended usage), but there is no analysis of whether the IDE-based code inspection features provide information that would not be available in a real-world scenario."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "Section VII identifies the dataset imbalance: 'Out of the 136 bugs used in our evaluation, 29 were related to jsoup project, and 70 were related to traccar project,' acknowledging non-independence due to project clustering."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "The paper relies on temporal arguments (dataset published after training) as a prevention strategy but does not apply any concrete detection methods such as canary strings, membership inference, or n-gram overlap analysis. They acknowledge residual risk without testing for it."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "ChatGPT-4o is the best LLM for automated unit test generation among the four models tested, achieving 57.97% compilation rate, 38.13% line coverage, and 24.63% branch coverage.",
    369       "evidence": "Section V-A, Figure 3. Out of 1632 pairwise comparisons, there are only two cases where ChatGPT-4o achieves statistically lower performance than another model. Other models had <7% compilation rates due to context size limitations.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "LLM-based test generation falls behind traditional methods (SBST, symbolic execution) in terms of code coverage.",
    374       "evidence": "Section V-B, Figure 4. EvoSuite: 46.44% line, 34.91% branch coverage. Kex: 38.60% line, 24.72% branch. TestSpark-ChatGPT-4o: 38.13% line, 24.63% branch. Statistical tests confirm EvoSuite's superiority in coverage (Figure 5a, 5b).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "LLM-based approach significantly outperforms traditional methods in mutation scores, suggesting deeper semantic understanding.",
    379       "evidence": "Section V-B. TestSpark-ChatGPT-4o has the best median mutation score (6.32%), but EvoSuite has the best average (30.56%). The Venn diagram (Figure 5c) shows TestSpark wins on more CUTs for mutation score. The claim of 'significantly outperforms' is overstated given the mixed mean/median results.",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "TestSpark-ChatGPT-4o failed to reproduce any real-world bugs, while EvoSuite and Kex reproduced 5.88% and 7.35% respectively.",
    384       "evidence": "Section V-B: 'TestSpark-ChatGPT-4o did not reproduce any bug. EvoSuite and Kex reproduced 5.88% and 7.35% of bugs, respectively.'",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "LLM-based approaches are especially sensitive to CUT size (SLOC), more so than traditional approaches.",
    389       "evidence": "Section V-C, Figure 6c. TestSpark-ChatGPT-4o shows 'a strong correlation with both the numbers of dependencies and SLOC' while Kex shows the lowest correlation with these features.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Context size is the primary factor explaining why non-ChatGPT-4o models performed poorly.",
    394       "evidence": "Section V-A: 'out of 1360 total runs of ChatGPT-4, 1229 failed with the context size error.' ChatGPT-4o has 128k context vs 4-8k for others.",
    395       "supported": "strong"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "Conflict of interest: authors evaluate their own tools",
    401       "detail": "Two of three tools evaluated (Kex and TestSpark) are products of the authors' organizations. First author Abdullin created Kex, co-author Derakhshanfar co-authored TestSpark. JetBrains funds the research and develops both tools. This conflict is not explicitly acknowledged or mitigated."
    402     },
    403     {
    404       "flag": "Overstated mutation score claim in abstract",
    405       "detail": "The abstract claims the LLM-based approach 'significantly outperforms' traditional methods in mutation scores, but EvoSuite has the highest mean mutation score (30.56%), above TestSpark-ChatGPT-4o's mean. Only the median favors TestSpark-ChatGPT-4o (6.32%). The statistical comparison is more nuanced than the abstract suggests."
    406     },
    407     {
    408       "flag": "Severe dataset imbalance",
    409       "detail": "70 of 136 bugs (51.5%) come from a single project (traccar) and 29 (21.3%) from jsoup. Together, two projects account for 72.8% of the benchmark, threatening external validity and potentially biasing results toward these projects' characteristics."
    410     },
    411     {
    412       "flag": "No multiple comparison correction",
    413       "detail": "Over 1600 pairwise statistical tests are performed without Bonferroni or other family-wise error rate correction. At a significance threshold of 0.05, approximately 80 false positives (1632 × 0.05 ≈ 82) would be expected by chance alone if every null hypothesis were true."
    414     },
    415     {
    416       "flag": "LLM hyperparameters not reported",
    417       "detail": "Temperature, top-p, and other sampling parameters for GPT-4o API calls are not reported. These settings significantly affect LLM output quality and reproducibility."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "EvoSuite: automatic test suite generation for object-oriented software",
    423       "authors": ["G. Fraser", "A. Arcuri"],
    424       "year": 2011,
    425       "relevance": "Foundational SBST tool for Java, one of three tools evaluated in this comparative study."
    426     },
    427     {
    428       "title": "CodaMosa: Escaping coverage plateaus in test generation with pre-trained large language models",
    429       "authors": ["C. Lemieux", "J. P. Inala", "S. K. Lahiri", "S. Sen"],
    430       "year": 2023,
    431       "relevance": "Pioneering hybrid approach combining EvoSuite with LLMs for test generation, directly relevant to future hybrid strategies."
    432     },
    433     {
    434       "title": "TestSpark: IntelliJ IDEA's ultimate test generation companion",
    435       "authors": ["A. Sapozhnikov", "M. Olsthoorn", "A. Panichella", "V. Kovalenko", "P. Derakhshanfar"],
    436       "year": 2024,
    437       "relevance": "LLM-based test generation tool evaluated in this study, supports multiple LLM backends."
    438     },
    439     {
    440       "title": "ChatUniTest: A framework for LLM-based test generation",
    441       "authors": ["Y. Chen", "Z. Hu", "C. Zhi", "J. Han", "S. Deng", "J. Yin"],
    442       "year": 2024,
    443       "relevance": "LLM-based test generation framework for Java with feedback loop, a direct competitor to TestSpark."
    444     },
    445     {
    446       "title": "Breaking the silence: the threats of using LLMs in software engineering",
    447       "authors": ["J. Sallou", "T. Durieux", "A. Panichella"],
    448       "year": 2024,
    449       "relevance": "Discusses data leakage and evaluation threats when using LLMs in SE research, directly motivating this study's benchmark choice."
    450     },
    451     {
    452       "title": "ChatGPT vs SBST: A comparative assessment of unit test suite generation",
    453       "authors": ["Y. Tang", "Z. Liu", "Z. Zhou", "X. Luo"],
    454       "year": 2024,
    455       "relevance": "Prior comparative study of ChatGPT-3.5 vs EvoSuite that this paper extends with more models and symbolic execution."
    456     },
    457     {
    458       "title": "An empirical evaluation of using large language models for automated unit test generation",
    459       "authors": ["M. Schäfer", "S. Nadi", "A. Eghbali", "F. Tip"],
    460       "year": 2023,
    461       "arxiv_id": "2302.06527",
    462       "relevance": "Early empirical study evaluating LLMs for unit test generation, establishing key evaluation methodology."
    463     },
    464     {
    465       "title": "CoverUp: Coverage-guided LLM-based test generation",
    466       "authors": ["J. A. Pizzorno", "E. D. Berger"],
    467       "year": 2024,
    468       "arxiv_id": "2403.16218",
    469       "relevance": "Extends CodaMosa with coverage-guided LLM feedback, representing the hybrid approach category."
    470     },
    471     {
    472       "title": "Code-aware prompting: A study of coverage-guided test generation in regression setting using LLM",
    473       "authors": ["G. Ryan", "S. Jain", "M. Shang", "S. Wang", "X. Ma", "M. K. Ramanathan", "B. Ray"],
    474       "year": 2024,
    475       "relevance": "LLM-based method-level test generation using code-aware prompts targeting specific paths."
    476     },
    477     {
    478       "title": "GitBug-Java: A reproducible benchmark of recent Java bugs",
    479       "authors": ["A. Silva", "N. Saavedra", "M. Monperrus"],
    480       "year": 2024,
    481       "relevance": "The benchmark dataset used in this study, designed to address data leakage in LLM evaluation."
    482     },
    483     {
    484       "title": "Evaluating and improving ChatGPT for unit test generation",
    485       "authors": ["Z. Yuan", "M. Liu", "S. Ding", "K. Wang", "Y. Chen", "X. Peng", "Y. Lou"],
    486       "year": 2024,
    487       "relevance": "Prior evaluation and improvement study of ChatGPT for unit test generation."
    488     },
    489     {
    490       "title": "Using large language models to generate JUnit tests: An empirical study",
    491       "authors": ["M. L. Siddiq", "J. C. Da Silva Santos", "R. H. Tanvir", "N. Ulfat", "F. Al Rifat", "V. Carvalho Lopes"],
    492       "year": 2024,
    493       "relevance": "Empirical study on LLM-generated JUnit tests, partially addressing data leakage via the SF110 dataset."
    494     },
    495     {
    496       "title": "LLM-powered test case generation for detecting tricky bugs",
    497       "authors": ["K. Liu", "Y. Liu", "Z. Chen", "J. M. Zhang", "Y. Han", "Y. Ma", "G. Li", "G. Huang"],
    498       "year": 2024,
    499       "arxiv_id": "2404.10304",
    500       "relevance": "LLM-based test generation approach targeting bug detection using program specifications."
    501     },
    502     {
    503       "title": "Unit test case generation with transformers and focal context",
    504       "authors": ["M. Tufano", "D. Drain", "A. Svyatkovskiy", "S. K. Deng", "N. Sundaresan"],
    505       "year": 2020,
    506       "arxiv_id": "2009.05617",
    507       "relevance": "Early transformer-based approach to unit test generation (AthenaTest), foundational for LLM-based methods."
    508     }
    509   ],
    510   "engagement_factors": {
    511     "practical_relevance": {
    512       "score": 2,
    513       "justification": "The open-source pipeline and tool comparison provide actionable guidance for practitioners choosing test generation tools, though the findings are Java-specific."
    514     },
    515     "surprise_contrarian": {
    516       "score": 1,
    517       "justification": "The finding that LLMs fail at fault reproduction (0%) while doing well on mutation score is mildly surprising, but LLMs underperforming traditional tools on coverage confirms existing intuitions."
    518     },
    519     "fear_safety": {
    520       "score": 0,
    521       "justification": "No safety or security concerns are raised; the paper is purely about test generation effectiveness."
    522     },
    523     "drama_conflict": {
    524       "score": 1,
    525       "justification": "The 'LLMs vs traditional methods' framing has mild drama but the results are nuanced rather than provocative."
    526     },
    527     "demo_ability": {
    528       "score": 1,
    529       "justification": "Pipeline code is available on GitHub but requires significant setup (Docker, IntelliJ headless mode, OpenAI API key, multiple tool installations)."
    530     },
    531     "brand_recognition": {
    532       "score": 1,
    533       "justification": "JetBrains is well-known in developer tooling. ChatGPT-4o is widely recognized. The venue (ICST) is respected but not broadly famous."
    534     }
    535   }
    536 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs