scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30864B)
      1 {
      2   "paper": {
      3     "title": "Knowledge Matters: Injecting Project and Testing Knowledge into LLM-based Unit Test Generation",
      4     "authors": [
      5       "Anji Li",
      6       "Mingwei Liu",
      7       "Zhenxi Chen",
      8       "Zheng Pei",
      9       "Zike Li",
     10       "Dekun Dai",
     11       "Yanlin Wang",
     12       "Zibin Zheng"
     13     ],
     14     "year": 2026,
     15     "venue": "ICSE 2026",
     16     "arxiv_id": "2511.14224",
     17     "doi": "10.1145/3744916.3787769"
     18   },
     19   "scan_version": 3,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval", "qualitative"],
     22   "key_findings": "KTester, a framework combining project-specific knowledge extraction and testing-domain-knowledge-guided multi-step generation, outperforms four state-of-the-art baselines across correctness and coverage metrics on the HITS dataset (110 complex Java methods). It improves execution pass rate by 5.03% and line coverage by 11.67% over the strongest baseline (HITS) while generating fewer test cases in less time. Ablation shows usage trace extraction most impacts correctness while the design/generation separation most impacts coverage. A user study with 15 participants rates KTester-generated tests highest in correctness, readability, and maintainability.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper states 'All data/code used in this study is provided in the package [9]' which references https://github.com/SYSUSELab/KTester."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The HITS dataset is a publicly available benchmark from prior work [52]. The authors also reference their replication package [9] containing data/code."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No environment specification (requirements.txt, Dockerfile, or detailed library versions) is provided in the paper. The replication package may contain this, but the paper itself does not describe the environment."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper does not include step-by-step reproduction instructions. A replication package is referenced but specific commands or procedures to replicate results are not provided in the paper text."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Despite repeating experiments 5 times, Tables 2-4 report only point estimates (e.g., '76.41' EPR) with no confidence intervals, error bars, or ± notation."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims KTester 'significantly outperforms' baselines but provides no statistical significance tests (no p-values, t-tests, or other tests). Claims of superiority are based solely on comparing raw numbers."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The paper reports improvements with baseline context, e.g., 'improving execution pass rate by 5.03% and line coverage by 11.67% over the strongest baseline.' Table 2 provides all baseline values for comparison."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification is given for the sample size of 110 tasks or 15 user study participants. No power analysis is provided."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "The paper states 'we repeated each experiment five times and report the average results' but no standard deviation, IQR, or any spread measure is reported in Tables 2-4."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Four baselines are compared: ChatUniTest, ChatTester, HITS (LLM-based), and UTGen (SBST+LLM hybrid). All are described in Section 3.1.2."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines are recent: HITS (ASE 2024), ChatUniTest (FSE 2024), ChatTester (2024), UTGen (ICSE 2025). These represent the current state of the art."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "RQ2 (Section 3.3) presents an ablation study with four variants (KTester-UTE, KTester-FMR, KTester-MVG, KTester-DGT), each removing one component. Results in Table 4."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Eight metrics are used: CPR, EPR, LC, BC, LCP, BCP, AvT, and AvTC, covering correctness, sufficiency, and efficiency."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "RQ3 (Section 3.4) conducts a user study with 15 participants rating tests on correctness, readability, and maintainability using a 4-point Likert scale."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The HITS benchmark (110 tasks) is used as the evaluation set. No fine-tuning is performed on this data — the LLM is used zero-shot with prompting, so no dev/test separation issue arises."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Table 1 breaks down the dataset by project (10 projects) with per-project task counts. Table 3 shows results across different LLM backends."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "The paper shows motivational examples of baseline failures (Figures 1-3) and compares HITS vs KTester output (Figure 10), but does not discuss cases where KTester itself fails or produces poor tests."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "Every experiment shows KTester outperforming baselines. The ablation shows drops when removing components, confirming each helps, but no configurations that failed or approaches that were tried and abandoned are reported."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims of 5.03% EPR improvement and 11.67% LC improvement match Table 2 (76.41-71.38=5.03, 63.94-52.27=11.67). However, the body text states KTester achieves '77.07%' EPR while Table 2 shows 76.41% — a minor inconsistency."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper makes causal claims ('improves', 'outperforms') supported by controlled ablation experiments (RQ2) that isolate component contributions through single-variable removal. All methods use the same LLM backend, controlling for model variation."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title 'LLM-based Unit Test Generation' and abstract claim broad applicability, but evaluation is only on Java (110 methods from 10 projects). Section 5 mentions Java focus but the title and framing are broader than the evidence supports."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "Section 5 (Threats to Validity) discusses Java focus and user study limitations but does not consider alternative explanations for the results, such as whether improvements come from simply providing more context to the LLM rather than the specific knowledge extraction approach."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures code coverage and execution pass rate and frames them as test quality metrics. It also separately measures readability/maintainability via human study. The claims match the granularity of measurements without inflating to broader quality claims."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The primary model is 'gpt-4o-mini' — a marketing name without a snapshot date or API version. For generalizability experiments, 'claude-3-5-haiku-20241022' (specific) and 'deepseek-v3.1' (not a precise snapshot) are used. The main experimental model lacks version specificity."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "Figures 7, 8, and 9 show prompt templates with '@input{...}' placeholders (e.g., '@input{focal method and target class}'). These are templates, not actual filled prompts. The reader cannot reconstruct every prompt sent to the model from what is in the paper."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.1.2 states 'temperature=0.5' for all LLM-based methods. Section 3.2.1 mentions 'limited automatic repair iterations to 5.' While minimal, key generation parameters are stated."
    162       },
    163       "scaffolding_described": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "The 5-step pipeline is described in detail in Section 2.2: test class framework generation, test case design, test method transformation, test class integration, and test class refinement. Knowledge extraction (Section 2.1) with AST analysis and CFG-based usage trace extraction is also thoroughly described."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 3.1.1 describes the HITS dataset: 110 tasks from 10 open-source Java projects targeting methods with cyclomatic complexity > 10. Table 1 provides per-project details with versions and task counts."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 5 'Threats to Validity' provides substantive discussion of LLM fairness, language generality, and user study limitations."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 5 discusses specific threats: use of same model version for fairness, reliance on official implementations, Java-specific evaluation, and limited user study sampling (mentions 'results may still reflect participant bias and limited sampling')."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "The paper mentions Java focus and suggests extending to other languages as future work, but does not explicitly state what the results do NOT show. No specific out-of-scope claims are articulated beyond vague generalization caveats."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The replication package [9] (https://github.com/SYSUSELab/KTester) is stated to contain 'all data/code used in this study.'"
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The HITS dataset is described in Section 3.1.1 with its construction criteria (cyclomatic complexity > 10), 10 source projects with versions listed in Table 1, and 110 total tasks."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section 3.4.1 describes user study recruitment: 'recruited through a public invitation distributed across the computer science departments of six universities,' selecting '15 participants (5 Ph.D. students and 10 Master's students)' with 2-5 years Java experience. Compensation mentioned."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Section 2 describes the full pipeline from offline knowledge extraction through 5-step test generation. Section 3.2.1 describes the experimental procedure including 5 repetitions and metric computation using JaCoCo."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "The Acknowledgments section lists NSFC Grant No. 62402113, Natural Science Foundation of Guangdong Province, Social Science Planning Project, and GMCC-SYSU Joint Lab."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "All authors are affiliated with Sun Yat-sen University. They are not evaluating their own commercial product — they evaluate a research framework against other academic methods."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "Funding is from Chinese national and provincial government science foundations, which have no financial interest in the specific outcome of LLM-based test generation comparisons."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interest declaration is present in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The paper does not state the training data cutoff for GPT-4o-mini, Claude 3.5 Haiku, or DeepSeek v3.1. These models could have seen the source code and existing tests from the benchmark projects during training."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No discussion of whether the LLMs were trained on code from the 10 open-source Java projects used in the HITS dataset. The models likely saw these projects' source code and possibly existing test suites during pre-training."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "The HITS dataset uses publicly available open-source projects (e.g., Gson, Commons-Collections). These are widely indexed and likely in LLM training data. This contamination risk is not discussed."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "No mention of pre-registration for the user study."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "No mention of IRB or ethics board approval for the user study involving 15 human participants."
    262       },
    263       "demographics_reported": {
    264         "applies": true,
    265         "answer": true,
    266         "justification": "Section 3.4.1 reports: '5 Ph.D. students and 10 Master's students, all with 2–5 years of Java development experience.' Minimal but present."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": true,
    270         "answer": false,
    271         "justification": "The paper says 'From the volunteers, we selected 15 participants' but does not state specific selection criteria beyond Java experience. How 15 were chosen from the volunteer pool is not described."
    272       },
    273       "randomization_described": {
    274         "applies": true,
    275         "answer": true,
    276         "justification": "Section 3.4.1 states 'the presentation order was randomised' and method identities were anonymized."
    277       },
    278       "blinding_described": {
    279         "applies": true,
    280         "answer": true,
    281         "justification": "Section 3.4.1 states 'the identities of the methods were fully anonymised' — participants did not know which method generated which test class."
    282       },
    283       "attrition_reported": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No information on whether all 15 participants completed all evaluations. Section 5 mentions '14 professional developers' while Section 3.4.1 says 15 participants — this discrepancy suggests possible unreported attrition."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Table 2 reports Average Time per Task (AvT): KTester takes 152.68 seconds per task. While API cost in dollars or tokens is not stated, wall-clock time per task is provided."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "No total computational budget is stated — no total API spend, total GPU hours, or hardware specifications for running the experiments."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The paper states 'we repeated each experiment five times and report the average results' but reports no variance, standard deviation, or any measure of sensitivity across runs."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Section 3.2.1 explicitly states: 'we repeated each experiment five times and report the average results.'"
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No hyperparameter search is described. Temperature=0.5 and repair iterations=5 are stated without explanation of how these values were chosen or what alternatives were considered."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "No discussion of how the final configuration was selected. Temperature=0.5 and other settings appear chosen without stated rationale or comparison against alternatives."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The paper makes many implicit comparisons (5 methods × 8 metrics) but performs no statistical tests at all, let alone corrections for multiple comparisons."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors implement their own system and compare against baseline implementations. They note they 'rely on official implementations or carefully reimplement them' but do not acknowledge the inherent bias of evaluating their own system."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Table 2 reports AvT (time per task) for all methods, enabling comparison at matched or understood compute levels. KTester (152.68s) is faster than HITS (625.69s) while achieving better results."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The paper uses the HITS dataset targeting methods with cyclomatic complexity > 10 but does not discuss whether this benchmark adequately measures the claimed capabilities of 'high-quality unit test generation' or whether complex methods are representative of real testing needs."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": true,
    344         "answer": true,
    345         "justification": "All LLM-based methods use the same model backend (GPT-4o-mini with temperature=0.5), isolating the scaffolding/framework as the variable being compared. The different frameworks (KTester, HITS, ChatUniTest, ChatTester) are the treatments under evaluation."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether GPT-4o-mini's training data includes source code or tests from the 10 open-source projects in the HITS benchmark. These projects (e.g., Gson 2.10.1, Commons-Collections 4.5.0) existed well before LLM training."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "KTester's project knowledge extraction provides the LLM with rich context from the codebase by design. No discussion of whether this context could leak test oracle information or existing test patterns to the model."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of whether training data includes code from the same repositories. The 110 methods come from 10 projects that are widely used and likely well-represented in LLM training corpora."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No concrete leakage detection or prevention method is used (no canary strings, membership inference, decontamination, or temporal splits)."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "KTester improves execution pass rate by 5.03% and line coverage by 11.67% over the strongest baseline (HITS)",
    374       "evidence": "Table 2: KTester EPR=76.41% vs HITS EPR=71.38% (Δ=5.03 percentage points); KTester LC=63.94% vs HITS LC=52.27% (Δ=11.67 percentage points). Averaged over 5 runs on 110 HITS dataset tasks.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "KTester achieves 100% compile pass rate",
    379       "evidence": "Table 2 shows CPR=100% for KTester across all 110 tasks, averaged over 5 runs.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Usage trace extraction is the primary contributor to test correctness",
    384       "evidence": "Table 4: removing usage trace extraction (KTester-UTE) causes the largest correctness drop — CPR drops by 6.37% and EPR drops by 14.41%.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Modular test case transformation contributes most to test adequacy",
    389       "evidence": "Table 4: replacing design+transformation with direct generation (KTester-DGT) causes the largest coverage drops — LC drops by 13.39% and BC drops by 11.15%.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Human evaluators rate KTester-generated tests significantly higher in correctness, readability, and maintainability",
    394       "evidence": "Figure 11: KTester has highest 'Strongly Agree' proportions across all three dimensions (0.34 correctness, 0.57 readability, 0.39 maintainability). 15 participants evaluated 10 tasks.",
    395       "supported": "weak"
    396     },
    397     {
    398       "claim": "KTester generalizes across different LLM backends",
    399       "evidence": "Table 3: KTester implemented with Claude 3.5 Haiku and DeepSeek v3.1 achieves even better results than GPT-4o-mini on most metrics.",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "No statistical significance tests",
    406       "detail": "The paper repeatedly claims methods 'significantly outperform' baselines but provides zero statistical tests. All comparisons are based on raw number comparisons from averaged results with no reported variance, making it impossible to determine whether differences are statistically meaningful."
    407     },
    408     {
    409       "flag": "No variance reported despite 5 runs",
    410       "detail": "Experiments are repeated 5 times but only averages are reported — no standard deviation, IQR, or confidence intervals. This hides potentially large run-to-run variation, especially given LLM output stochasticity at temperature=0.5."
    411     },
    412     {
    413       "flag": "Internal numeric inconsistency",
    414       "detail": "The body text (Section 3.2.2) states KTester achieves '77.07%' EPR, but Table 2 shows 76.41%. Section 5 refers to '14 professional developers' while Section 3.4.1 describes 15 participants. These inconsistencies suggest careless editing."
    415     },
    416     {
    417       "flag": "Contamination risk unaddressed",
    418       "detail": "The benchmark uses popular open-source Java projects (Gson, Commons-Collections, etc.) that are almost certainly in GPT-4o-mini's training data. Existing tests for these projects may also be in the training data, giving the LLM prior knowledge that is not controlled for."
    419     },
    420     {
    421       "flag": "Small user study with no statistical analysis",
    422       "detail": "15 participants (all students from CS departments) evaluate 10 tasks with no statistical tests on the Likert ratings. Results are presented as proportions only. No inter-rater reliability is reported."
    423     },
    424     {
    425       "flag": "Only successes shown",
    426       "detail": "The paper shows no cases where KTester fails or produces poor tests. All qualitative examples and quantitative results highlight KTester's advantages. No analysis of failure modes or conditions where the approach underperforms."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "ChatUniTest: A Framework for LLM-Based Test Generation",
    432       "authors": ["Yinghao Chen", "Zehao Hu", "Chen Zhi", "Junxiao Han", "Shuiguang Deng", "Jianwei Yin"],
    433       "year": 2024,
    434       "doi": "10.1145/3663529.3663801",
    435       "relevance": "LLM-based test generation framework that serves as a key baseline for evaluating KTester."
    436     },
    437     {
    438       "title": "HITS: High-coverage LLM-based Unit Test Generation via Method Slicing",
    439       "authors": ["Zejun Wang", "Kaibo Liu", "Ge Li", "Zhi Jin"],
    440       "year": 2024,
    441       "doi": "10.1145/3691620.3695501",
    442       "relevance": "State-of-the-art LLM-based test generation using method slicing, serves as the strongest baseline."
    443     },
    444     {
    445       "title": "Evaluating and Improving ChatGPT for Unit Test Generation",
    446       "authors": ["Zhiqiang Yuan", "Yiling Lou", "Mingwei Liu", "Shiji Ding", "Kaixin Wang", "Yixuan Chen", "Xin Peng"],
    447       "year": 2024,
    448       "doi": "10.1145/3660783",
    449       "relevance": "ChatTester evaluation of ChatGPT for test generation, baseline method in this study."
    450     },
    451     {
    452       "title": "Leveraging Large Language Models for Enhancing the Understandability of Generated Unit Tests",
    453       "authors": ["Amirhossein Deljouyi", "Roham Koohestani", "Maliheh Izadi", "Andy Zaidman"],
    454       "year": 2025,
    455       "doi": "10.1109/ICSE55347.2025.00032",
    456       "relevance": "UTGen hybrid SBST+LLM approach that uses LLMs to improve test readability, serves as a baseline."
    457     },
    458     {
    459       "title": "CodaMosa: Escaping Coverage Plateaus in Test Generation with Pre-Trained Large Language Models",
    460       "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K. Lahiri", "Siddhartha Sen"],
    461       "year": 2023,
    462       "doi": "10.1109/ICSE48619.2023.00085",
    463       "relevance": "Hybrid approach integrating LLM-generated test seeds with evolutionary algorithms for coverage-driven test generation."
    464     },
    465     {
    466       "title": "An empirical evaluation of using large language models for automated unit test generation",
    467       "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"],
    468       "year": 2023,
    469       "doi": "10.1109/TSE.2023.3334955",
    470       "relevance": "TestPilot system incorporating documentation and usage examples for LLM-based test generation."
    471     },
    472     {
    473       "title": "Code-Aware Prompting: A Study of Coverage-Guided Test Generation in Regression Setting using LLM",
    474       "authors": ["Gabriel Ryan", "Siddhartha Jain", "Mingyue Shang", "Shiqi Wang"],
    475       "year": 2024,
    476       "doi": "10.1145/3643769",
    477       "relevance": "SymPrompt — uses symbolic execution to steer LLM prompting for test generation."
    478     },
    479     {
    480       "title": "CoverUp: Effective High Coverage Test Generation for Python",
    481       "authors": ["Juan Altmayer Pizzorno", "Emery D. Berger"],
    482       "year": 2025,
    483       "doi": "10.1145/3729398",
    484       "relevance": "Coverage-guided dialogue approach for refining LLM-generated tests."
    485     },
    486     {
    487       "title": "ASTER: Natural and Multi-Language Unit Test Generation with LLMs",
    488       "authors": ["Rangeet Pan", "Myeongsoo Kim", "Rahul Krishna", "Raju Pavuluri", "Saurabh Sinha"],
    489       "year": 2025,
    490       "doi": "10.1109/ICSE-SEIP66354.2025.00042",
    491       "relevance": "Multi-language LLM-based test generation system with multi-step reasoning."
    492     },
    493     {
    494       "title": "exLong: Generating exceptional behavior tests with large language models",
    495       "authors": ["Jiyang Zhang", "Yu Liu", "Pengyu Nie", "Junyi Jessy Li", "Milos Gligoric"],
    496       "year": 2025,
    497       "doi": "10.1109/ICSE55347.2025.00176",
    498       "relevance": "Fine-tuned CodeLlama for exception-oriented test generation, relevant to LLM test generation capabilities."
    499     },
    500     {
    501       "title": "TestART: Improving LLM-based Unit Testing via Co-evolution of Automated Generation and Repair Iteration",
    502       "authors": ["Siqi Gu", "Quanjun Zhang", "Kecheng Li", "Chunrong Fang"],
    503       "year": 2025,
    504       "arxiv_id": "2408.03095",
    505       "doi": "10.48550/arXiv.2408.03095",
    506       "relevance": "Co-evolution approach for LLM test generation and repair, uses similar evaluation metrics."
    507     },
    508     {
    509       "title": "Unit test case generation with transformers and focal context",
    510       "authors": ["Michele Tufano", "Dawn Drain", "Alexey Svyatkovskiy", "Shao Kun Deng", "Neel Sundaresan"],
    511       "year": 2020,
    512       "doi": "10.48550/arXiv.2009.05617",
    513       "relevance": "AthenaTest — pioneering transformer-based test generation using focal context."
    514     }
    515   ],
    516   "engagement_factors": {
    517     "practical_relevance": {
    518       "score": 2,
    519       "justification": "Developers could adopt the knowledge-extraction + multi-step generation approach; code is released on GitHub."
    520     },
    521     "surprise_contrarian": {
    522       "score": 0,
    523       "justification": "Confirms the expected intuition that more context and structured generation improve LLM test output — not contrarian."
    524     },
    525     "fear_safety": {
    526       "score": 0,
    527       "justification": "No safety or security concerns raised; the paper is about improving test generation quality."
    528     },
    529     "drama_conflict": {
    530       "score": 0,
    531       "justification": "No controversy — straightforward incremental improvement over existing methods."
    532     },
    533     "demo_ability": {
    534       "score": 2,
    535       "justification": "Code and data are released at https://github.com/SYSUSELab/KTester; a practitioner could try it on their Java projects."
    536     },
    537     "brand_recognition": {
    538       "score": 1,
    539       "justification": "Published at ICSE (top venue) but from Sun Yat-sen University, not a widely recognized AI lab."
    540     }
    541   }
    542 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs