scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29848B)
      1 {
      2   "paper": {
      3     "title": "EvoGPT: Leveraging LLM-Driven Seed Diversity to Improve Search-Based Test Suite Generation",
      4     "authors": [
      5       "Lior Broide",
      6       "Roni Stern",
      7       "Argaman Mordoch"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv",
     11     "arxiv_id": "2505.12424"
     12   },
     13   "scan_version": 3,
     14   "active_modules": ["experimental_rigor", "data_leakage"],
     15   "methodology_tags": ["benchmark-eval"],
     16   "key_findings": "EvoGPT, a hybrid LLM-SBST test generation system, achieves ~9-11 percentage-point improvements in line coverage, branch coverage, and mutation score over EvoSuite and TestART on 17 Defects4J projects, rising to ~18 points in mutation score over EvoSuite. An ablation study shows that naive LLM+EA integration provides limited benefit, and that prompt diversity, temperature diversity, and plateau escape via diverse LLM injection each contribute incrementally to performance. Performance gains saturate at a population size of 25 test suites.",
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Section IV-F (Reproducibility) states: 'Our code, data, and scripts are available at https://tinyurl.com/EvoGPT.'"
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Defects4J is a publicly available benchmark. The authors additionally claim their data is available at their repository URL."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper mentions JaCoCo 0.8.12, PITest 1.19.0, and gpt-4o-mini, but does not provide a comprehensive environment specification (no requirements.txt, Dockerfile, Java version, or dependency list). Tool versions are scattered across the text rather than in a dedicated environment section."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper provides a URL to the code repository but does not include step-by-step reproduction instructions in the paper itself. No 'Reproducing Results' section or specific commands are documented."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "All results in Tables III, V, and VI are reported as point estimates without confidence intervals, error bars, or ± notation."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Table IV reports Wilcoxon signed-rank tests comparing EvoGPT against both baselines across all three metrics, with p-values (all < 0.001)."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Table IV reports Cliff's delta (δ) as an effect size measure for all comparisons, ranging from 0.75 to 0.98, with interpretation guidelines provided (|δ| ≥ 0.474 = large effect)."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The evaluation uses all 17 Defects4J projects, but no power analysis or justification is given for whether n=17 is sufficient for the statistical claims being made."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No standard deviations, variance, or spread measures across experimental runs are reported. The authors explicitly acknowledge in Section IV-F (construct validity) that 'future work could analyze variance across seeds to better characterize the stability of the generated test suites.'"
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Two baselines are included: EvoSuite (EA-based SBST tool) and TestART (LLM-based test generation). Both are compared across all metrics and projects."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "TestART (2024) is a recent LLM-based approach shown to outperform ChatUniTest. EvoSuite is the standard SBST baseline still widely used. The paper justifies excluding CodaMosa, pytLMtester (Python-specific), and SearchSYS (system-level) as inapplicable to their Java unit-test setting."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Table VI presents a comprehensive ablation study with 6 configurations: LLM-only, +EA, +Temperature diversity, +Prompt diversity, +Plateau recovery, and Full EvoGPT, measuring the incremental contribution of each component."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Three metrics are used: Line Coverage of Correct Tests (LCCT), Branch Coverage of Correct Tests (BCCT), and Mutation Score of Correct Tests (MSCT)."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No human evaluation is conducted. The authors acknowledge in construct validity (Section IV-F) that they 'did not measure other qualitative aspects such as readability, assertion relevance, or developer trust.'"
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No separation between tuning and evaluation data. Fitness weights were chosen from 'preliminary experiments' (Section III-B1) and the population size was tuned (Table V), but results are reported on the same Defects4J projects without a held-out split."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Table III provides per-project breakdowns for all 17 Defects4J projects across all three metrics, not just aggregate averages."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "No qualitative failure analysis is provided. There is no discussion of specific classes or methods where EvoGPT fails, no error analysis, and no examples of failure modes."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The ablation study (Table VI) shows that adding a standard EA to LLM-generated tests provides only a 'limited increase in performance' (+1.5% LCCT, +1.3% BCCT, +0.7% MSCT). Table V shows diminishing returns beyond population size 25."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims '10% improvement in both code coverage and mutation score.' Table III shows LCCT: 92 vs 83/83 (~9%), BCCT: 90 vs 79/80 (~10-11%), MSCT: 87 vs 69/78 (~9-18%). The claim of ~10% average improvement is supported."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Causal claims about component contributions are made via ablation studies (Table VI), which use controlled single-variable manipulation — each row adds one component. This is an adequate design for causal attribution in this context."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper explicitly bounds results to Defects4J and Java. Section IV-F (external validity) states: 'Our experiments are limited to Defects4J projects only. Although these span a range of domains, they may not fully represent the complexity of industrial code bases.' Future work mentions extending to 'additional programming languages.'"
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Section IV-F discusses specific alternative explanations: PITest may not detect equivalent mutants (falsely lowering mutation score), JaCoCo may miss paths due to bytecode instrumentation quirks, and LLM stochasticity means results may vary across runs."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "MSCT is explicitly described as 'a commonly used proxy for fault detection capability' (Section II). In construct validity, the authors acknowledge they 'did not measure other qualitative aspects such as readability, assertion relevance, or developer trust,' distinguishing automated proxy metrics from broader test quality."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper states 'gpt-4o-mini' without a snapshot date or API version. Section IV-F acknowledges: 'the LLM we used (gpt-4o-mini) is periodically updated by OpenAI,' indicating the model version is not pinned."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Table I shows prompt objectives (summaries of instructions), not the full prompt text. The paper states 'The exact system prompts used for each LLM agent are included in the provided code' — the actual prompts are deferred to the code repository rather than provided in the paper."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Temperature values (0.3, 0.4, 0.5, 0.6, 0.8), population size (25), evolutionary budget (25 generations), crossover probability (0.8), stagnation threshold (τ=5), fitness improvement threshold (α=0.5), max plateau escapes (k=3), and mutation probability (1/Ntests) are all reported."
    156       },
    157       "scaffolding_described": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section III describes the full pipeline in detail: source code preprocessing, 5 LLM agent configurations, generation-repair loop (4 max iterations), coverage enhancement with JaCoCo feedback, EA with ranked selection/crossover/mutation, plateau detection and LLM injection. Algorithm 1 provides pseudo-code. Figure 1 provides a system diagram."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section III-A states: 'we remove excessive or lengthy inline comments, documentation blocks, and unreachable code segments that could overload the model's context window.' Focal method extraction from public classes is also described."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section IV-E 'Limitations' discusses runtime, monetary cost, and evolutionary budget vs. wall-clock time. Section IV-F 'Threats to validity' covers internal, construct, and external validity."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Specific threats include: PITest may not detect equivalent mutants; JaCoCo may miss paths due to bytecode instrumentation; LLM stochasticity means results vary across runs; Defects4J-only evaluation; focal methods are public methods only; exact reproducibility depends on specific API configurations."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section IV-F explicitly states: experiments limited to Defects4J, may not represent industrial codebases; focal methods are public methods which 'may not capture the full interaction behavior in production scenarios'; and reproducibility depends on LLM access. Future work lists extending to 'additional programming languages.'"
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The paper states 'Our code, data, and scripts are available at https://tinyurl.com/EvoGPT,' suggesting raw experimental data is accessible alongside the code."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Table II describes all 17 Defects4J projects with identifiers, version numbers, and focal method counts. The paper explains that focal methods are 'the public methods of each public class under test.' Defects4J is a well-documented standard benchmark."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants. Data source is the standard Defects4J benchmark."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "While the test generation pipeline is described, the data pipeline from generation to evaluation lacks specifics: how many tests were generated, how many were removed by the repair loop, how many passed compilation and execution. No filtering counts or attrition figures are reported."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding, acknowledgments, or grant information is mentioned anywhere in the paper."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "All three authors list their affiliation as Ben-Gurion University of the Negev, Faculty of Computer and Information Science. No commercial product is being evaluated."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "No funding is disclosed; appears to be unfunded academic research with no commercial interest in the outcome."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial disclosure statement is present in the paper."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The paper uses gpt-4o-mini to generate tests for Defects4J Java projects but does not state the model's training data cutoff date. Since the LLM may have seen the source code during training, this is relevant."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "Defects4J projects (e.g., Commons-Lang, Gson, JFreeChart) are well-known open-source projects likely in gpt-4o-mini's training data. No discussion of whether the model has seen the code under test, which could inflate test quality metrics."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "Defects4J was published in 2014 and its constituent projects are widely available on GitHub. gpt-4o-mini has almost certainly been trained on this code. The paper does not address this contamination risk despite it potentially inflating all reported metrics."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Table VII reports monetary cost per class: EvoGPT $0.32, TestART $0.01, EvoSuite $0.00. Average runtime per class is also reported: EvoGPT 8 min, TestART 2 min, EvoSuite 1 min."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Table VII reports average runtime per class (8 minutes for EvoGPT). The evolutionary budget is stated as population=25, generations=25. Per-class API costs are provided. While total API spend across all experiments is not given, the per-class figures allow estimation."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No multi-seed analysis is reported. The authors explicitly acknowledge in construct validity: 'future work could analyze variance across seeds to better characterize the stability of the generated test suites.'"
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The paper does not state how many experimental runs produced the reported results. Results appear to be from single runs."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Section III-B1 mentions 'In preliminary experiments, we evaluated several alternative weight configurations' for fitness weights, but does not report how many configurations were tried or the search method used."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The fitness weight configuration was selected because it 'provided the most stable convergence behavior and best structural and mutation-based metrics,' but no details on how many alternatives were compared or on what data the selection was made."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "Six Wilcoxon signed-rank tests are reported (3 metrics × 2 baselines) without any correction for multiple comparisons (e.g., Bonferroni, Holm)."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The authors compare EvoGPT against EvoSuite and TestART without acknowledging the bias of evaluating their own system. While EvoSuite is used as-is (default configuration), the comparison inherently favors the authors' tuned system."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "EvoGPT takes 8 min/class vs 1-2 min for baselines, and $0.32 vs $0.00-$0.01. While these differences are reported (Table VII), no performance-as-a-function-of-compute analysis is provided. The evolutionary budget is matched but total compute is not."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The paper does not discuss whether Defects4J adequately represents real-world testing needs. No analysis of whether LCCT/BCCT/MSCT on Defects4J translates to actual bug-finding capability in production code."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": true,
    338         "answer": true,
    339         "justification": "EvoGPT and TestART both use gpt-4o-mini, holding the model constant while comparing scaffolding approaches. The comparison is explicitly about the scaffold/approach, not the underlying model. EvoSuite uses no LLM, serving as an SBST-only baseline."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "Defects4J projects have been publicly available since 2014. gpt-4o-mini was likely trained on data including these projects' source code and existing test suites. This temporal overlap is not discussed."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "The LLM receives source code as input to generate tests. If the model memorized existing tests for these well-known projects during training, it could reproduce rather than generate novel tests. This is not discussed."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether Defects4J code appears in gpt-4o-mini's training set, or whether the model's pre-existing knowledge of these open-source projects provides an unfair advantage."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination analysis."
    362       }
    363     }
    364   },
    365   "claims": [
    366     {
    367       "claim": "EvoGPT achieves on average a 10% improvement in code coverage and mutation score over TestART and EvoSuite on Defects4J.",
    368       "evidence": "Table III shows EvoGPT total averages: LCCT 92%, BCCT 90%, MSCT 87% vs TestART (83%, 80%, 78%) and EvoSuite (83%, 79%, 69%). Wilcoxon tests (Table IV) confirm statistical significance (p < 0.001 for all comparisons).",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "Naive integration of LLM-generated tests and EA optimization yields limited advantage.",
    373       "evidence": "Table VI ablation: LLM-only achieves 83.4/80.8/80.1 (LCCT/BCCT/MSCT); adding EA only increases to 84.9/82.1/80.8, a gain of ~0.7-1.5 percentage points.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Prompt and temperature diversity are key to EvoGPT's performance.",
    378       "evidence": "Table VI ablation shows incremental gains from temperature diversity (+1.3-2.5%) and prompt diversity (+0.6-0.9%). The full diversity-enabled configuration adds 2% LCCT and MSCT over plateau recovery alone.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "EvoGPT's multi-prompt configuration produces semantically diverse initial test suites.",
    383       "evidence": "Section IV-C reports Jaccard similarity analysis: intra-configuration similarity 0.526 vs inter-configuration similarity 0.476, showing different configurations produce semantically distinct tests.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Population size of 25 is sufficient; larger populations yield no significant additional gains.",
    388       "evidence": "Table V shows MSCT: 0.87 at both population 25 and 30; LCCT: 0.92 at both; BCCT: 0.90 at population 25 vs 0.91 at population 30.",
    389       "supported": "moderate"
    390     }
    391   ],
    392   "red_flags": [
    393     {
    394       "flag": "No variance or seed analysis for stochastic system",
    395       "detail": "EvoGPT uses stochastic LLM generation and evolutionary algorithms, yet all results appear to be from single runs. The authors acknowledge this gap ('future work could analyze variance across seeds') but report definitive numbers without any spread measures. Results could vary significantly across runs."
    396     },
    397     {
    398       "flag": "Unmatched compute budgets in baseline comparison",
    399       "detail": "EvoGPT costs $0.32/class and 8 min/class vs TestART's $0.01/2 min and EvoSuite's $0.00/1 min. While evolutionary budgets are matched (25 population, 25 generations), the 4x-8x runtime difference and the 32x monetary cost difference (vs TestART) are not accounted for in performance comparisons. EvoSuite with 8x more generations might close the gap."
    400     },
    401     {
    402       "flag": "Benchmark contamination risk unaddressed",
    403       "detail": "Defects4J consists of well-known open-source Java projects (Commons-Lang, Gson, JFreeChart, etc.) almost certainly in gpt-4o-mini's training data. The LLM may have memorized existing tests for these projects, inflating all LLM-based results (both EvoGPT and TestART). This affects absolute numbers but not the EvoGPT-vs-TestART comparison."
    404     },
    405     {
    406       "flag": "No multiple comparison correction",
    407       "detail": "Six Wilcoxon tests (3 metrics × 2 baselines) are performed without Bonferroni or other family-wise error rate correction. While all p-values are < 0.001, the absence of correction is a methodological gap."
    408     },
    409     {
    410       "flag": "Model version not pinned",
    411       "detail": "gpt-4o-mini is used without a version/snapshot identifier. The authors acknowledge the model 'is periodically updated by OpenAI,' meaning exact reproduction is not possible. Model updates between the experiments and any replication attempt could change results."
    412     }
    413   ],
    414   "cited_papers": [
    415     {
    416       "title": "Whole test suite generation",
    417       "authors": ["G. Fraser", "A. Arcuri"],
    418       "year": 2013,
    419       "relevance": "Foundational SBST approach (EvoSuite) used as a primary baseline in this study."
    420     },
    421     {
    422       "title": "TestART: Improving LLM-based unit testing via co-evolution of automated generation and repair iteration",
    423       "authors": ["S. Gu", "Q. Zhang", "C. Fang", "F. Tian", "L. Zhu", "J. Zhou", "Z. Chen"],
    424       "year": 2024,
    425       "arxiv_id": "2408.03095",
    426       "relevance": "State-of-the-art LLM-based test generation approach used as the primary LLM baseline."
    427     },
    428     {
    429       "title": "CodaMOSA: Escaping coverage plateaus in test generation with pre-trained large language models",
    430       "authors": ["C. Lemieux", "J. P. Inala", "S. K. Lahiri", "S. Sen"],
    431       "year": 2023,
    432       "relevance": "Pioneering LLM-SBST hybrid for test generation; inspired EvoGPT's plateau escape mechanism."
    433     },
    434     {
    435       "title": "ChatUniTest: A framework for LLM-based test generation",
    436       "authors": ["Y. Chen", "Z. Hu", "C. Zhi", "J. Han", "S. Deng", "J. Yin"],
    437       "year": 2024,
    438       "relevance": "LLM-based test generation framework using generation-repair loops, shown to be outperformed by TestART."
    439     },
    440     {
    441       "title": "Evaluating large language models trained on code",
    442       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    443       "year": 2021,
    444       "arxiv_id": "2107.03374",
    445       "relevance": "Foundational work on LLM code generation capabilities (Codex), enabling the LLM-based test generation paradigm."
    446     },
    447     {
    448       "title": "A3Test: Assertion-augmented automated test case generation",
    449       "authors": ["S. Alagarsamy", "C. Tantithamthavorn", "A. Aleti"],
    450       "year": 2024,
    451       "relevance": "LLM-based test generation approach focusing on assertion quality."
    452     },
    453     {
    454       "title": "An empirical evaluation of using large language models for automated unit test generation",
    455       "authors": ["M. Schäfer", "S. Nadi", "A. Eghbali", "F. Tip"],
    456       "year": 2024,
    457       "relevance": "Empirical evaluation of LLM-based test generation (TestPilot), finding limitations in LLM-generated test quality."
    458     },
    459     {
    460       "title": "Optimizing search-based unit test generation with large language models: An empirical study",
    461       "authors": ["D. Xiao", "Y. Guo", "Y. Li", "L. Chen"],
    462       "year": 2024,
    463       "relevance": "Empirical study of where LLM assistance is most effective in EA-based test generation (initialization vs. during search)."
    464     },
    465     {
    466       "title": "Test Wars: A comparative study of SBST, symbolic execution, and LLM-based approaches to unit test generation",
    467       "authors": ["A. Abdullin", "P. Derakhshanfar", "A. Panichella"],
    468       "year": 2025,
    469       "relevance": "Comparative evaluation of SBST, symbolic execution, and LLM approaches to test generation."
    470     },
    471     {
    472       "title": "Mutation-guided LLM-based test generation at Meta",
    473       "authors": ["M. Harman", "J. Ritchey", "I. Harper", "S. Sengupta", "K. Mao", "A. Gulati", "C. Foster", "H. Robert"],
    474       "year": 2025,
    475       "relevance": "Industrial-scale LLM-based test generation at Meta using mutation-guided approach."
    476     },
    477     {
    478       "title": "Self-refine: Iterative refinement with self-feedback",
    479       "authors": ["A. Madaan", "N. Tandon", "P. Gupta"],
    480       "year": 2023,
    481       "relevance": "General LLM self-refinement framework relevant to iterative test repair approaches."
    482     },
    483     {
    484       "title": "LLM-enhanced evolutionary test generation for untyped languages",
    485       "authors": ["R. Yang", "X. Xu", "R. Wang"],
    486       "year": 2025,
    487       "relevance": "Hybrid LLM-EA test generation (pytLMtester) extending DynaMOSA for Python."
    488     }
    489   ],
    490   "engagement_factors": {
    491     "practical_relevance": {
    492       "score": 2,
    493       "justification": "Code is released and the approach could be integrated into Java test generation workflows, though the $0.32/class cost and 8-minute runtime limit immediate adoption."
    494     },
    495     "surprise_contrarian": {
    496       "score": 1,
    497       "justification": "The finding that naive LLM+EA integration provides limited benefit while diversity is key is somewhat surprising, but the overall result that hybrid approaches outperform individual ones confirms expectations."
    498     },
    499     "fear_safety": {
    500       "score": 0,
    501       "justification": "Test generation tool with no safety or security implications."
    502     },
    503     "drama_conflict": {
    504       "score": 0,
    505       "justification": "No controversy; straightforward benchmark comparison with incremental improvements."
    506     },
    507     "demo_ability": {
    508       "score": 2,
    509       "justification": "Code and scripts released at a public URL; requires Java setup and OpenAI API key to run."
    510     },
    511     "brand_recognition": {
    512       "score": 0,
    513       "justification": "Academic group from Ben-Gurion University; not a well-known AI lab or company."
    514     }
    515   }
    516 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs