scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31714B)
      1 {
      2   "paper": {
      3     "title": "LLM Test Generation via Iterative Hybrid Program Analysis",
      4     "authors": ["Sijia Gu", "Noor Nashid", "Ali Mesbah"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2503.13580",
      8     "doi": "10.1145/3744916.3764553"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Panta combines static control flow analysis with dynamic code coverage to iteratively guide LLMs in generating unit tests targeting uncovered execution paths. On 130 complex Java classes from Defects4J, Panta achieves 70.2% line coverage and 60.8% branch coverage with Llama 3.3 70B, outperforming a re-implemented SymPrompt baseline by 26 and 23 percentage points respectively. The ablation study shows the iterative framework contributes ~28pp coverage improvement over single-pass, while hybrid program analysis adds another ~10pp. Claude 3.5 Haiku performs best among four evaluated models, but mutation scores remain below 50% across all models, indicating LLM-generated tests still have weak fault detection.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Panta is publicly available at a GitHub repository (reference [25]: https://github.com/PANTA-TestAutomation/Panta). Section 6 states: 'we have made Panta's implementation and dataset publicly available.'"
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The evaluation uses the publicly available Defects4J benchmark (v2.0.1). Section 6 states implementation and dataset are publicly available at [25]. Section 4.1 notes 'More details about individual classes are publicly available in our repository.'"
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Section 4.1 specifies Java 8+, Maven v3.6.3, JUnit v4.13.2, JaCoCo v0.8.11, and Pitest v1.17.0. Combined with the public repository, this provides sufficient dependency information to recreate the environment."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Section 6 (Reproducibility) states: 'we provide detailed instructions for replicating our experimental results' and points to the public repository at [25]."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Tables 2, 3, and 4 report only point estimates for all metrics (line coverage, branch coverage, pass rate, mutation score). No confidence intervals, error bars, or ± notation are provided anywhere."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "A paired t-test is used in Section 4.3 (ablation) to show Panta_basic and Panta_cov are NOT significantly different. However, no significance tests are applied to the main claims: the 26/23 percentage-point improvement over SymPrompt (RQ1), or the model ranking (RQ3). The core outperformance claims are based on comparing numbers without any statistical test."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Percentage point differences are reported with baseline context throughout: '26.3% higher line coverage' with both values visible (70.18% vs 43.91% in Table 2). The ablation (Table 3) and model comparison (Table 4) similarly report differences with baseline values in the tables."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The 130 classes and 2,971 MUTs are selected by criteria (CYC > 10, non-abstract, public), but there is no justification for why this sample size is sufficient for the claims, no power analysis, and no acknowledgment of whether the sample is adequate."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Section 4.1 explicitly states: 'Given the low temperature, the generation process is largely deterministic, with only minor fluctuations. As a result, we perform a single run per class.' No variance, standard deviation, or spread measure is reported."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Panta is compared against SymPrompt (re-implemented from Ryan et al. 2024) in RQ1 (Table 2). The ablation study (RQ2) includes a single-pass baseline, Panta_basic, and Panta_cov variants."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "SymPrompt (2024) is described as a state-of-the-art LLM-based test generation technique. The paper also discusses CoverUp (2024) and HITS (2024) as recent related work, noting HITS could not be replicated."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "RQ2 (Section 4.3, Table 3) presents a thorough ablation with four variants: single-pass baseline, Panta_basic (iterative framework only), Panta_cov (+ coverage info), and full Panta (+ path selection). This isolates each component's contribution."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Five evaluation metrics are used: line coverage, branch coverage, pass rate, mutation score, and high coverage count (HCC). This provides a multi-faceted view of test quality."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "All evaluation is automated (coverage measurement, mutation testing, pass rate). No human evaluation of test quality, readability, or usefulness is conducted, despite the paper citing readability as a motivation for LLM-based test generation over search-based approaches."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "There is no discussion of separating development data from evaluation data. The prompt templates and algorithm parameters (e.g., maxNoIncreaseLimit, numOfAttempt=3) may have been tuned on the same Defects4J projects used for evaluation. No explicit dev/test split is described."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per project in Tables 2, 3, and 4 (14 projects). Table 3 additionally breaks results by complexity class (CYC_max ≤ 20 vs > 20). Individual class details are available in the repository."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 5 (Limitations) discusses several failure modes: classes with heavy inheritance, LLMs' inability to fix runtime errors (AssertionError), low mutation scores, and difficulty with large utility classes. Section 4.2 discusses cases where SymPrompt outperforms Panta (Collections pass rate, JDatabind/Lang mutation score)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that SymPrompt achieves higher pass rate in Collections and higher mutation score in JDatabind and Lang (Section 4.2). The ablation shows Panta_cov sometimes performs slightly worse than Panta_basic. Average mutation score is below 50% for all models."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims '26% higher line coverage and 23% higher branch coverage compared to the state-of-the-art.' Table 2 shows 26.3pp higher line coverage (70.18% vs 43.91%) and 22.7pp higher branch coverage (60.83% vs 38.17%), matching the abstract."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims like 'our iterative framework significantly enhances overall code coverage' and 'integrating code coverage-driven path selection further improves coverage by approximately 10%' are supported by the ablation study (Table 3), which uses controlled single-variable manipulation across four variants."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 6 (External Validity) explicitly states: 'Our current implementation and evaluation focus on Java, which may impact the generalizability of our findings.' Section 5 (Application) bounds scope to Java SE projects with similar structural complexity. Though the title is broad, the paper body properly bounds claims."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 6 discusses several alternative explanations: data contamination (LLMs may have seen Defects4J code), Comex tool limitations affecting path extraction, model capability as a confound, and coverage vs fault detection distinction. These are specific to the study."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures coverage and mutation score and claims these indicate test quality. They explicitly acknowledge the gap: 'coverage alone does not ensure fault detection if test assertions are non-existent or weak' (Section 4.1), and add mutation score specifically to address this proxy gap. Claims are stated at the measurement level (coverage, mutation score) without inflating to broader quality claims."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper uses 'Llama 3.3 70B', 'Mistral Large 2', 'GPT-4o Mini', and 'Claude 3.5 Haiku'. These are marketing names without specific version strings, snapshot dates, or API version identifiers. No model checkpoint dates or API versions are stated."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Figure 4 shows a prompt template with placeholders ({source_file_numbered}, {test_file}, {selected_path_for_method_1}, etc.), not the full prompt text. The test repair prompt is not shown at all. Any system-level instructions or output format requirements are not shown. While the repository may contain full prompts, the paper itself only provides the template structure."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.1 reports: 'max_tokens set to 4096 for output generation and temperature set to 0.2' for all four LLMs. Algorithm parameters are also stated: numOfAttempt=3 for repair (Section 3.4), and the stopping conditions (maxCYC iterations, maxNoIncreaseLimit)."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The iterative workflow is described in detail: Algorithms 1 (path extraction), 2 (path selection), and 3 (iterative framework) provide full pseudocode. Figure 2 shows the system overview. The test validation, repair loop, feedback mechanism, and stopping conditions are all documented."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4.1 documents the class selection pipeline: starting from the 17 Defects4J v2.0.1 projects, excluding 3 outdated projects (leaving 14), selecting non-abstract public classes with at least one method with CYC > 10, then manually excluding outliers (CYC > 40 switch patterns, inheritance-heavy classes). Final counts: 130 classes, 2,971 MUTs."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5 (Discussion) contains a dedicated 'Limitations' subsection discussing source code context, test repair, and test effectiveness limitations. Section 6 (Threats to Validity) discusses internal and external validity threats."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats discussed include: Comex bugs potentially affecting path extraction (mitigated by verification during implementation), data contamination with Defects4J, Java-only evaluation limiting generalizability, large utility class challenges, and LLMs' inability to fix runtime errors like AssertionError."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 5 explicitly states Panta 'is most applicable to Java SE projects with similar structural complexity' and that extending to Java EE 'remains an important direction for future work.' Section 6 states the Java focus 'may impact the generalizability.' They note the absence of cross-class context."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 4.1 states 'More details about individual classes are publicly available in our repository.' Section 6 says 'we have made Panta's implementation and dataset publicly available.' The public repository at [25] contains per-class results and the evaluation dataset."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.1 describes the subject selection in detail: Defects4J v2.0.1 latest fixed versions, 14 projects after excluding 3 outdated ones, class selection criteria (non-abstract, public, CYC > 10), manual exclusion of outliers. Table 1 provides full details per project."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data source is the standard Defects4J benchmark, a well-known public dataset in software testing research."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from subject selection through evaluation is documented: select Defects4J projects → filter classes by criteria → extract CFGs → run iterative test generation → validate tests → measure coverage and mutation score. Each step is described in Sections 3-4."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding section, acknowledgments, or grant information appears in the paper. Whether the research was funded is not disclosed."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All three authors are affiliated with the University of British Columbia, clearly stated in the paper header. They are not affiliated with any of the LLM providers evaluated (Meta, Mistral, OpenAI, Anthropic)."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence of the funder cannot be assessed. Absence of funding disclosure is not the same as absence of funding."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement, patent disclosures, or financial interest declarations appear anywhere in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the four evaluated models (Llama 3.3 70B, Mistral Large 2, GPT-4o Mini, Claude 3.5 Haiku). This is relevant because Defects4J has been public since 2014."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Section 6 (Internal Validity) explicitly discusses: 'data contamination, i.e., the possibility that LLMs may have encountered parts of the Defects4J codebase during pretraining.' They mitigate by comparing relative improvements between prompting strategies using the same model."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Section 6 acknowledges Defects4J code may be in training data and provides a mitigation strategy: 'all evaluated prompting strategies use the same set of projects and the same underlying LLM. Our evaluation therefore reports relative improvements between prompting strategies rather than absolute performance.'"
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. The evaluation is entirely automated on code benchmarks."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study evaluates an automated tool on open-source code."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants or experimental conditions assigned to humans."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Section 5 (Runtime) reports wall-clock time: 'the average runtime for Panta to generate a passing test suite is 2.3 minutes per method, or approximately 53 minutes per class.' Per-project breakdowns are also provided (e.g., Cli: 0.8 min/method, Collections: 9.4 min/method). No dollar costs or token counts are reported."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Per-method and per-class runtimes are stated, but the total computational budget (total API spend, total wall-clock time for all experiments, or hardware used) is not quantified. For 2,971 methods × 4 models, this represents substantial compute that is not totaled."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Section 4.1 explicitly states: 'we perform a single run per class.' No seed sensitivity analysis is conducted despite using temperature=0.2 which still introduces stochasticity."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 4.1 explicitly states: 'we perform a single run per class and report the average results across all classes for each project.' The number of runs (one) is clearly stated."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Key parameters (temperature=0.2, max_tokens=4096, numOfAttempt=3, maxNoIncreaseLimit) are stated but no hyperparameter search is described. How these values were selected is not discussed."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The selection of temperature=0.2, maxNoIncreaseLimit, and other parameters is not justified. The paper states these values without explaining why they were chosen or whether alternatives were considered."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Multiple comparisons are made across 14 projects, 4 models, and multiple metrics. A paired t-test is used in one comparison (Section 4.3) but no correction for multiple comparisons (Bonferroni, Holm, etc.) is applied."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The main baseline (SymPrompt) was re-implemented by the authors because 'neither HITS nor SymPrompt has publicly available implementations.' The bias of evaluating their own system against their own re-implementation of the baseline is not acknowledged or discussed."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Panta is iterative (multiple LLM calls per class, up to maxCYC iterations) while SymPrompt is single-pass. This substantial compute difference is not discussed or controlled for. No compute-matched comparison is provided."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper uses Defects4J and measures coverage and mutation score as proxies for test quality. Whether coverage/mutation score on Defects4J classes with CYC > 10 is representative of real-world test generation challenges is not discussed."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "In RQ3 (Table 4), four different LLMs are compared using the same Panta scaffold, properly controlling for the scaffold variable. In RQ1, the scaffold IS the independent variable being evaluated (Panta vs SymPrompt), so there is no confound."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "Section 6 discusses that LLMs may have seen Defects4J code during pretraining (Defects4J published 2014, all models trained after). They mitigate by comparing relative improvements between strategies rather than absolute performance."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information not available in real usage. For example, whether providing full source code and coverage reports in the prompt mirrors realistic development scenarios is not discussed."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The 130 classes come from only 14 projects. Classes within the same project share code patterns, dependencies, and coding styles. Non-independence across classes within projects is not discussed or addressed."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection method is applied (no canary strings, membership inference, n-gram overlap analysis, or decontamination pipeline). The only mitigation is the relative comparison strategy."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Panta achieves 26% higher line coverage and 23% higher branch coverage compared to the state-of-the-art (SymPrompt)",
    365       "evidence": "Table 2 shows average line coverage of 70.18% (Panta) vs 43.91% (SymPrompt) and branch coverage of 60.83% vs 38.17% across 130 classes from 14 Defects4J projects. Panta outperforms on all 14 projects for both metrics.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "The iterative framework alone (Panta_basic) improves line coverage by 27.65% and branch coverage by 25.77% over a single-pass baseline",
    370       "evidence": "Table 3 ablation study shows baseline at 32.69% line / 24.87% branch coverage vs Panta_basic at 60.34% / 50.64%. Results reported across all 130 classes.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Hybrid program analysis (path selection) adds approximately 10% coverage improvement over iterative framework alone",
    375       "evidence": "Table 3 shows full Panta at 70.18% line / 60.83% branch vs Panta_basic at 60.34% / 50.64%, approximately 10pp improvement from adding static+dynamic path selection.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Claude 3.5 Haiku is the best-performing model across all metrics",
    380       "evidence": "Table 4 shows Claude achieves 73.20% line coverage, 66.52% branch coverage, 68.16% pass rate, 47.92% mutation score, and 62 HCC, leading all four models on every metric.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Panta maintains consistent coverage across complexity levels while other variants show significant drops for highly complex code",
    385       "evidence": "Table 3 complexity breakdown shows Panta with only 1pp drop for CYC_max > 20 (70.37% vs 69.42% line coverage) while baseline drops 6.6pp (34.01% vs 27.38%) and Panta_basic drops 6.1pp (61.56% vs 55.49%).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "LLM-generated tests have weak fault detection capability, with mutation scores below 50% across all models",
    390       "evidence": "Table 4 shows average mutation scores of 43.79% (Llama), 36.60% (Mistral), 34.86% (GPT), and 47.92% (Claude). Section 5 discusses this limitation explicitly.",
    391       "supported": "strong"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Re-implemented baseline",
    397       "detail": "The main baseline (SymPrompt) was re-implemented by the authors because neither SymPrompt nor HITS has publicly available code. This introduces self-comparison bias (Lucic et al. 2018 showed authors' re-implementations of baselines systematically underperform). The 26pp coverage gap is large enough that even a suboptimal re-implementation might not fully explain it, but the bias is unacknowledged."
    398     },
    399     {
    400       "flag": "Single run with no variance",
    401       "detail": "Despite using temperature=0.2 (not 0), all results are from a single run per class. The paper argues 'the generation process is largely deterministic' but provides no evidence for this claim. Minor fluctuations could compound across 130 classes."
    402     },
    403     {
    404       "flag": "No significance tests on main claims",
    405       "detail": "The central claim of 26%/23% improvement over SymPrompt is reported as raw percentage point differences without any statistical test. A paired t-test is used only in the ablation to show two variants are NOT different. The main claim of superiority lacks statistical support."
    406     },
    407     {
    408       "flag": "Unfair compute comparison",
    409       "detail": "Panta uses multiple LLM iterations per class (up to maxCYC iterations, with test repair loops), while SymPrompt makes single-pass LLM calls per method. The compute difference is substantial and undiscussed. A compute-matched comparison would be more informative."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Automated unit test improvement using large language models at meta",
    415       "authors": ["Nadia Alshahwan", "Jubin Chheda", "Anastasia Finogenova", "Beliz Gokkaya", "Mark Harman", "Inna Harper", "Alexandru Marginean", "Shubho Sengupta", "Eddy Wang"],
    416       "year": 2024,
    417       "relevance": "Industry deployment of LLM-based test generation at Meta, evidence of LLM test generation in production."
    418     },
    419     {
    420       "title": "Chatunitest: A framework for llm-based test generation",
    421       "authors": ["Yinghao Chen", "Zehao Hu", "Chen Zhi", "Junxiao Han", "Shuiguang Deng", "Jianwei Yin"],
    422       "year": 2024,
    423       "relevance": "LLM-based test generation framework with generation-validation-repair mechanism."
    424     },
    425     {
    426       "title": "Leveraging Large Language Models for Enhancing the Understandability of Generated Unit Tests",
    427       "authors": ["Amirhossein Deljouyi", "Roham Koohestani", "Maliheh Izadi", "Andy Zaidman"],
    428       "year": 2024,
    429       "relevance": "Combines search-based testing with LLMs to improve test understandability."
    430     },
    431     {
    432       "title": "An empirical evaluation of using large language models for automated unit test generation",
    433       "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"],
    434       "year": 2023,
    435       "relevance": "Empirical evaluation of LLM test generation in JavaScript, documents coverage limitations."
    436     },
    437     {
    438       "title": "CodaMosa: Escaping Coverage Plateaus in Test Generation with Pre-Trained Large Language Models",
    439       "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K. Lahiri", "Siddhartha Sen"],
    440       "year": 2023,
    441       "relevance": "Hybrid approach combining search-based testing with LLMs to overcome coverage plateaus."
    442     },
    443     {
    444       "title": "Code-Aware Prompting: A Study of Coverage-Guided Test Generation in Regression Setting using LLM",
    445       "authors": ["Gabriel Ryan", "Siddhartha Jain", "Mingyue Shang", "Shiqi Wang", "Xiaofei Ma", "Murali Krishna Ramanathan", "Baishakhi Ray"],
    446       "year": 2024,
    447       "doi": "10.1145/3643769",
    448       "relevance": "SymPrompt: main baseline for comparison. Uses symbolic execution path constraints to guide LLM test generation."
    449     },
    450     {
    451       "title": "Coverup: Coverage-guided llm-based test generation",
    452       "authors": ["Juan Altmayer Pizzorno", "Emery D Berger"],
    453       "year": 2024,
    454       "relevance": "Coverage-guided LLM test generation using coverage reports to indicate uncovered code."
    455     },
    456     {
    457       "title": "HITS: High-coverage LLM-based Unit Test Generation via Method Slicing",
    458       "authors": ["Zejun Wang", "Kaibo Liu", "Ge Li", "Zhi Jin"],
    459       "year": 2024,
    460       "relevance": "LLM-based test generation using method decomposition into slices."
    461     },
    462     {
    463       "title": "ASTER: Natural and Multi-language Unit Test Generation with LLMs",
    464       "authors": ["Rangeet Pan", "Myeongsoo Kim", "Rahul Krishna", "Raju Pavuluri", "Saurabh Sinha"],
    465       "year": 2025,
    466       "arxiv_id": "2409.03093",
    467       "relevance": "Multi-language LLM test generation pipeline for complex software systems."
    468     },
    469     {
    470       "title": "Using large language models to generate junit tests: An empirical study",
    471       "authors": ["Mohammed Latif Siddiq", "Joanna Cecilia Da Silva Santos", "Ridwanul Hasan Tanvir", "Noshin Ulfat", "Fahmid Al Rifat", "Vinícius Carvalho Lopes"],
    472       "year": 2024,
    473       "relevance": "Empirical study of LLM-generated JUnit tests documenting coverage challenges."
    474     },
    475     {
    476       "title": "Testeval: Benchmarking large language models for test case generation",
    477       "authors": ["Wenhan Wang", "Chenyuan Yang", "Zhijie Wang", "Yuheng Huang", "Zhaoyang Chu", "Da Song", "Lingming Zhang", "An Ran Chen", "Lei Ma"],
    478       "year": 2024,
    479       "relevance": "Benchmark for evaluating LLM test generation capabilities across programming languages."
    480     },
    481     {
    482       "title": "Large Language Models are Few-Shot Testers: Exploring LLM-Based General Bug Reproduction",
    483       "authors": ["Sungmin Kang", "Juyeon Yoon", "Shin Yoo"],
    484       "year": 2023,
    485       "doi": "10.1109/ICSE48619.2023.00194",
    486       "relevance": "LLM-based bug reproduction from bug reports, related capability for test generation."
    487     }
    488   ]
    489 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs