scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (33191B)
      1 {
      2   "paper": {
      3     "title": "LLM-based Unit Test Generation via Property Retrieval",
      4     "authors": [
      5       "Zhe Zhang",
      6       "Xingyu Liu",
      7       "Yuanzhang Lin",
      8       "Xiang Gao",
      9       "Hailong Sun",
     10       "Yuan Yuan"
     11     ],
     12     "year": 2024,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2410.13542",
     15     "doi": "10.48550/arXiv.2410.13542"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "APT, a property-based retrieval augmentation tool for unit test generation, achieves 60.2% successful test execution and 54.2% full coverage across 1515 methods in 12 Java projects, outperforming ChatUniTest (24.0%/21.3%) and vanilla LLM-based RAG (15.4%/13.2%). The iterative strategy component consistently reduces compilation and runtime errors across all projects. Generated tests show significantly fewer code style violations (87 vs 413 for ChatUniTest) and mock objects (8 vs 61) than baselines.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The paper describes APT as implemented in 8,000 lines of Python (Section 5) but provides no repository URL, Zenodo archive, or any link to released source code."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper uses 12 open-source projects (4 from HITS/ChatUniTest datasets, 8 crawled from GitHub) but does not release the specific dataset of 1515 methods, the extracted property relationships, or evaluation artifacts."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper does not list library versions or dependency information needed to recreate the environment."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology is described at a conceptual level but lacks the specifics needed to replicate the experiments."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Tables 2 and 4 report only point estimates (e.g., '60.2%', '25.9%') with no confidence intervals, error bars, or uncertainty measures."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims APT 'significantly outperforms' baselines (e.g., Section 5.1.1) but provides no statistical tests (no p-values, t-tests, or any significance testing). All comparisons are based on raw percentage differences."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Results in Table 2 provide both absolute performance numbers and baseline context (e.g., APT 60.2% successful executions vs ChatUniTest 24.0%, LLM with RAG 15.4%, EvoSuite 26.7%), allowing the reader to assess effect magnitude."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The sample of 1515 methods across 12 projects is described but not justified. No power analysis or rationale for why this sample size is sufficient. Project selection criteria (150+ stars, updated recently) are stated but the adequacy of 12 projects is not discussed."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be single-run numbers with no indication of stability."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The paper compares APT against ChatUniTest, LLM with default RAG (DeepSeek-V2.5), and EvoSuite (Section 5.1). GitHub Copilot is additionally used as a baseline for the maintainability evaluation (Section 5.2)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "ChatUniTest (2023) and HITS (2024, attempted but had runtime issues) are contemporary. EvoSuite (latest version 1.0.6) is the standard SBST baseline. The authors note HITS was unusable due to runtime issues, providing justification for its exclusion."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Table 4 presents a detailed ablation of the Iterative Strategy (IS) component, showing performance with and without IS across all 12 projects on four metrics. However, no ablation is provided for the property retrieval component itself."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The paper evaluates correctness (compilation/run errors, assertion errors, successful execution), completeness (full coverage), and maintainability (code style violations via CheckStyle, mock density via PMD) — multiple distinct metrics across different quality dimensions."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "The 5 graduate/doctoral researchers in Section 5.4 manually analyze property relationship patterns, but this is a qualitative analysis of the approach's internal behavior, not a human evaluation of the generated test quality. No humans evaluate the actual test outputs for correctness, readability, or practical usefulness."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No separation between development and evaluation projects is described. The same 12 projects appear to have been used for both developing and evaluating the approach, with no held-out set."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Figure 11 and Table 4 provide per-project breakdowns of all metrics across all 12 projects, revealing significant variation (e.g., commons-dbutils at 91.7% full coverage vs datafaker at 21.4%)."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 5.1.1 discusses that 'projects with abstract or interface-heavy code pose significant challenges for all tools' and specifically analyzes the datafaker and ice4j projects as challenging cases. The abstract class instantiation failure mode is identified."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "APT outperforms all baselines on every metric in every comparison. No experiments or design choices are reported as having failed or hurt performance. The only ablation (IS) shows consistent improvement, and every project-level comparison favors APT."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims APT 'consistently outperforms existing tools in terms of correctness, completeness, and maintainability.' Table 2 (correctness/completeness), Figure 12 and Table 3 (maintainability) support these claims within the tested scope."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper uses causal language ('improves', 'enhances', 'significantly improves the correctness') attributing improvement to property-based retrieval, but the ablation study only isolates the iterative strategy component. There is no ablation of the core property retrieval mechanism vs. a simpler retrieval approach, leaving the primary causal claim unsupported by controlled comparison."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title 'LLM-based Unit Test Generation via Property Retrieval' and abstract claim of 'valuable insights and potential applications for other code-related tasks' extend well beyond the tested scope. All experiments are on Java projects with a single LLM (DeepSeek-V2.5), but the paper does not bound claims to Java or to this specific model."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "Section 6.1 acknowledges LLM and project diversity as threats but does not discuss alternative explanations for the observed improvements. For example, providing more context (regardless of property structure) or the test bundle mechanism (not the property relationships) could explain performance gains. No confound analysis is presented."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper explicitly defines its metrics: 'correct rate (syntactically correct, compilable, and runnable)' and 'passing rate (accurately reflect the requirements)' citing Yu et al. [55]. Maintainability is measured via CheckStyle violations and mock density, with explicit definitions. The measurements reasonably match the granularity of the claims."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "DeepSeek-V2.5, which identifies a specific model release, is specified as the LLM (Section 5.1), and EvoSuite version 1.0.6 is stated. However, no version is specified for GitHub Copilot used in the maintainability comparison."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Figure 10 shows a 'formal version of the prompt' but it is pseudocode, not the actual prompt text sent to the LLM. The paper references 'property_prompt' and 'fallback_prompt' in Algorithm 1 without providing their actual content. The ChatTester-adapted prompt for the baseline is also not provided."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No temperature, top-p, max tokens, or other LLM API parameters are reported. Only task-level parameters are mentioned: 'a maximum of two repair rounds' and default N=3 for property retrieval. EvoSuite's 120-second search budget is stated."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The APT pipeline is described in detail in Section 4 with Figure 6 showing the workflow: Metainfo Builder, Test Case Analyzer, Property Analyzer, UT Generator with iterative strategy. Algorithm 1 formalizes the generation process. The preprocessing, retrieval, ranking, and generation stages are all documented."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 4.1 describes AST parsing, metainfo extraction, scope graph construction, and reference resolution. Section 5.1 documents filtering: 'Private methods and methods with only one line of effective code are filtered out. For methods with nested or anonymous inner classes, only the outer method is tested.' Project selection criteria are stated."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 6.1 'Threats to Validity' provides a dedicated discussion of validity threats including LLM choice, project diversity, and dependence on existing test coverage."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 6.1 identifies threats specific to this study: 'Due to budget constraints, we utilize DeepSeek V2.5' and 'The presence of existing unit tests can affect APT's performance, as it relies on these tests to establish property relationships and guide the generation process.' These are specific to APT's design."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound findings to Java, to DeepSeek-V2.5, or to projects with pre-existing test suites. Section 6.2 instead expands scope by suggesting extension to 'other code-related tasks' without acknowledging current boundaries."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw data is released. The generated test cases, property relationship analyses, compilation logs, and per-method results are not available for independent verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 5.1 describes the dataset: 4 projects from HITS/ChatUniTest datasets, 8 crawled from GitHub, with selection criteria (150+ stars, updated within last month, various domains including utilities, parsers, and network protocols). 1515 focal methods are tested."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "For the human analysis in Section 5.4.1: '5 graduate and doctoral researchers, all majoring in software engineering with relevant expertise, are selected as participants.' Training and cross-validation procedures are described. For projects: selection criteria follow HITS [47] methodology."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The paper does not document the full pipeline from project collection to final analysis. Key filtering counts are missing: how many total methods existed before filtering private methods and one-liners? How many projects were considered before selecting 12? The jump from '12 projects' to '1515 methods' lacks intermediate stage counts."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding source, grants, or sponsorship information is disclosed anywhere in the paper."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "All six authors are clearly listed as affiliated with Beihang University, China. The paper evaluates their own tool APT, not a product from their employer."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding is disclosed, so independence of funding cannot be assessed."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The paper does not state the training data cutoff date for DeepSeek-V2.5. The 12 open-source projects are from GitHub and could be in the model's training data."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether DeepSeek-V2.5 may have seen the source code or existing test cases of the 12 evaluation projects during training. Given these are popular GitHub repositories (150+ stars), overlap is plausible."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "The evaluation projects are well-known open-source repositories (e.g., jsoup, commons-collections, Redisson) that are widely available online. No contamination analysis is performed despite the high likelihood that DeepSeek-V2.5's training data includes these repositories."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "The paper has no human subjects study. The 5 researchers in Section 5.4 serve as expert annotators for qualitative analysis, not as study participants."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human subjects study is conducted. The expert annotation task does not constitute human subjects research."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human subjects study. The annotators are described as 'graduate and doctoral researchers majoring in software engineering' but this is expertise qualification, not demographic reporting for a human study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human subjects study is conducted."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human subjects study is conducted."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human subjects study is conducted."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human subjects study is conducted."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No inference cost, API cost, tokens consumed, or wall-clock time is reported for APT or any of the LLM-based baselines. The paper mentions 'budget constraints' for choosing DeepSeek-V2.5 but does not quantify actual costs."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No total computational budget is stated. EvoSuite's 120-second per-method budget is mentioned, but no equivalent time or compute figure is given for APT or the LLM-based tools."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No multi-seed analysis is reported. LLM outputs are stochastic, but the paper does not report results across multiple random seeds or temperature settings."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The paper states 'This process is repeated multiple times' (Section 5.1) but does not specify the exact number of runs per focal method. The maximum of two repair rounds is mentioned but the total generation attempts are not quantified."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No hyperparameter search budget is reported. The default N=3 for property retrieval is mentioned but how this value was chosen is not discussed."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "Configuration parameters (N=3 default, 2 repair rounds, ranking order) are stated but not justified through systematic selection. No validation set or configuration comparison is presented."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied. The paper makes numerous comparisons across 12 projects and 4 tools without any statistical framework."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors evaluate their own APT tool against their own re-runs of baselines without acknowledging potential bias. No independent evaluation or discussion of self-comparison bias is present, despite Lucic et al. (2018) showing this is a systematic issue."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "APT's multi-stage pipeline (AST parsing, property analysis, LLM calls for analysis + generation + repair) likely uses substantially more compute than the simpler baselines, but no compute-matched comparison is provided."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The paper does not discuss whether compilation success, assertion passing, and code coverage truly measure 'correctness, completeness, and maintainability' as claimed. Despite citing Yu et al. [55] on practitioner expectations, no analysis of construct validity for these proxy metrics is provided."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "APT uses a complex multi-stage pipeline (metainfo extraction, property analysis, test bundle retrieval, iterative generation) while baselines use simpler approaches. The performance difference could be due to the richer scaffolding rather than the property-based retrieval specifically. No scaffold-matched comparison is provided."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of whether DeepSeek-V2.5's training data includes the evaluation projects or their test suites. The open-source projects (e.g., jsoup, commons-collections) predate the model and their tests could be in the training data."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether providing existing test cases in the prompt constitutes a form of information leakage in the evaluation. If the model already knows these tests from training, the benefit of retrieval may be overstated."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "Methods within the same project are not independent (they share code style, patterns, and dependencies), but this non-independence is not discussed. Results are reported per-method without clustering by project."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination analysis is performed."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "APT achieves 60.2% successful test execution rate, significantly outperforming ChatUniTest (24.0%), LLM with default RAG (15.4%), and EvoSuite (26.7%).",
    372       "evidence": "Table 2 in Section 5.1.1 shows compilation/run errors, assertion errors, successful executions, and full coverage across 1515 methods for all tools.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "APT achieves 54.2% full coverage rate, outperforming all baselines (ChatUniTest 21.3%, LLM with RAG 13.2%, EvoSuite 25.4%).",
    377       "evidence": "Table 2, Section 5.1.1. Per-project breakdown in Figure 11 confirms the trend across most projects.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "APT generates tests with significantly fewer code style violations than ChatUniTest and GitHub Copilot.",
    382       "evidence": "Figure 12, Section 5.2: APT has 87 total violations vs ChatUniTest 413 and Copilot 292, measured on 167 commonly fully-covered methods using CheckStyle.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "APT introduces only 8 additional mocks compared to ChatUniTest's 61 and Copilot's 42.",
    387       "evidence": "Table 3, Section 5.2, using PMD for mock detection on the same 167 test methods.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "The Iterative Strategy improves test quality by reducing errors and enhancing both coverage success and overall coverage.",
    392       "evidence": "Table 4 shows with/without IS comparison across all 12 projects. Compilation errors decrease (e.g., binance 22.6%→12.3%, jsoup 26.2%→11.3%) and full coverage increases across all projects.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Six property relationship patterns (structural similarity, behavioral similarity, substitutability, exception handling similarity, resource access similarity, dependency) are identified across methods.",
    397       "evidence": "Table 5, Section 5.4.2. 300 methods randomly sampled and independently reviewed by 5 researchers with cross-validation.",
    398       "supported": "moderate"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "No statistical significance testing",
    404       "detail": "All claims of superiority ('significantly outperforms', 'consistently outperforms') are based on raw percentage comparisons without any statistical tests. With 12 heterogeneous projects and stochastic LLM outputs, the observed differences could be within natural variance."
    405     },
    406     {
    407       "flag": "No code or data released",
    408       "detail": "APT is described as 8,000 lines of Python code but nothing is released. The evaluation dataset, generated tests, property relationships, and compilation logs are all unavailable for independent verification."
    409     },
    410     {
    411       "flag": "Convenient baseline exclusion",
    412       "detail": "HITS (2024), the most contemporary and directly comparable baseline, is excluded due to 'significant runtime issues' making it 'unusable for generating unit tests at the method level.' This conveniently removes the strongest competitor without independent verification of the claimed incompatibility."
    413     },
    414     {
    415       "flag": "Training data contamination risk",
    416       "detail": "The 12 evaluation projects are popular GitHub repositories (150+ stars) likely present in DeepSeek-V2.5's training data. Since APT retrieves and provides existing test cases from these repos, the model may already know these tests, conflating memorization with the benefit of property-based retrieval."
    417     },
    418     {
    419       "flag": "Incomplete ablation",
    420       "detail": "Only the Iterative Strategy component is ablated (Table 4). The core innovation — property-based retrieval — is never ablated. There is no comparison of APT with property retrieval vs APT with simpler retrieval (e.g., random test case selection, BM25 similarity), so the causal contribution of property relationships is unestablished."
    421     },
    422     {
    423       "flag": "Unequal compute across tools",
    424       "detail": "APT runs a multi-stage pipeline (AST parsing, LLM-based property analysis, test bundle extraction, LLM-based generation, iterative repair) while baselines are simpler. No compute-matched comparison ensures that the improvement isn't simply from using more LLM calls and more sophisticated preprocessing."
    425     },
    426     {
    427       "flag": "Asymmetric EvoSuite comparison",
    428       "detail": "EvoSuite is tested on only 389 of 1515 methods due to Java 8 / JUnit 4 limitations, making this a subset comparison. The 389 methods likely differ systematically from the full set, potentially favoring or disfavoring EvoSuite in ways that are not analyzed."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "ChatUniTest: a ChatGPT-based automated unit test generation tool",
    434       "authors": ["Zhuokui Xie", "Yinghao Chen", "Chen Zhi", "Shuiguang Deng", "Jianwei Yin"],
    435       "year": 2023,
    436       "relevance": "Primary baseline — LLM-based unit test generation tool using ChatGPT with adaptive focal context."
    437     },
    438     {
    439       "title": "No more manual tests? evaluating and improving chatgpt for unit test generation",
    440       "authors": ["Zhiqiang Yuan", "Yiling Lou", "Mingwei Liu"],
    441       "year": 2023,
    442       "relevance": "ChatTester tool that refines LLM-generated test cases, demonstrating LLMs can outperform SBST for unit testing."
    443     },
    444     {
    445       "title": "HITS: High-coverage LLM-based Unit Test Generation via Method Slicing",
    446       "authors": ["Zejun Wang", "Kaibo Liu", "Ge Li", "Zhi Jin"],
    447       "year": 2024,
    448       "relevance": "Contemporary LLM unit test tool that decomposes focal methods into slices for coverage improvement — excluded baseline."
    449     },
    450     {
    451       "title": "Enhancing LLM-based Test Generation for Hard-to-Cover Branches via Program Analysis",
    452       "authors": ["C. Yang", "J. Chen", "B. Lin"],
    453       "year": 2024,
    454       "arxiv_id": "2404.04966",
    455       "relevance": "TELPA hybrid approach combining program analysis with LLMs for test generation of hard-to-cover branches."
    456     },
    457     {
    458       "title": "TestART: Improving LLM-based Unit Test via Co-evolution of Automated Generation and Repair Iteration",
    459       "authors": ["Siqi Gu", "Chunrong Fang", "Quanjun Zhang"],
    460       "year": 2024,
    461       "arxiv_id": "2408.03095",
    462       "relevance": "LLM-based test generation with iterative repair, directly related to the repair mechanism used in APT."
    463     },
    464     {
    465       "title": "Evosuite: automatic test suite generation for object-oriented software",
    466       "authors": ["Gordon Fraser", "Andrea Arcuri"],
    467       "year": 2011,
    468       "relevance": "Foundational search-based test generation tool used as a baseline in the evaluation."
    469     },
    470     {
    471       "title": "Unit test case generation with transformers and focal context",
    472       "authors": ["M. Tufano", "D. Drain", "A. Svyatkovskiy"],
    473       "year": 2020,
    474       "arxiv_id": "2009.05617",
    475       "relevance": "AthenaTest — early deep learning approach to unit test generation using transformer models and focal context."
    476     },
    477     {
    478       "title": "MASAI: Modular Architecture for Software-engineering AI Agents",
    479       "authors": ["Daman Arora", "Atharv Sonwane", "Nalin Wadhwa"],
    480       "year": 2024,
    481       "arxiv_id": "2406.11638",
    482       "relevance": "Modular AI agent architecture for software engineering that designs specific retrieval tools."
    483     },
    484     {
    485       "title": "Autocoderover: Autonomous program improvement",
    486       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    487       "year": 2024,
    488       "relevance": "AST-based retrieval approach for autonomous code modification, related to retrieval-augmented code generation."
    489     },
    490     {
    491       "title": "Swe-bench: Can language models resolve real-world github issues?",
    492       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"],
    493       "year": 2023,
    494       "arxiv_id": "2310.06770",
    495       "relevance": "Major benchmark for evaluating LLMs on real-world software engineering tasks, uses BM25 similarity retrieval."
    496     },
    497     {
    498       "title": "Codamosa: Escaping coverage plateaus in test generation with pre-trained large language models",
    499       "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K Lahiri", "Siddhartha Sen"],
    500       "year": 2023,
    501       "relevance": "Hybrid approach combining LLMs with search-based testing to escape coverage plateaus."
    502     },
    503     {
    504       "title": "Practitioners' Expectations on Automated Test Generation",
    505       "authors": ["Xiao Yu", "Lei Liu", "Xing Hu", "Jacky Keung", "Xin Xia", "David Lo"],
    506       "year": 2024,
    507       "relevance": "Empirical study of what practitioners value in test generation tools — cited to motivate APT's focus on correctness and maintainability over coverage."
    508     },
    509     {
    510       "title": "ChatGPT vs SBST: A comparative assessment of unit test suite generation",
    511       "authors": ["Yutian Tang", "Zhijie Liu", "Zhichao Zhou", "Xiapu Luo"],
    512       "year": 2024,
    513       "relevance": "Comparative study of ChatGPT vs search-based testing for unit test generation, whose methodology (CheckStyle, PMD) is adopted in APT's maintainability evaluation."
    514     }
    515   ],
    516   "engagement_factors": {
    517     "practical_relevance": {
    518       "score": 2,
    519       "justification": "APT addresses a real developer need (unit test generation) with a concrete tool, but it is not released and requires significant setup including AST parsing infrastructure."
    520     },
    521     "surprise_contrarian": {
    522       "score": 1,
    523       "justification": "The insight that existing tests can guide new test generation via property relationships is novel but not deeply surprising — it confirms that more context helps LLMs."
    524     },
    525     "fear_safety": {
    526       "score": 0,
    527       "justification": "No AI safety, security, or risk implications."
    528     },
    529     "drama_conflict": {
    530       "score": 0,
    531       "justification": "No controversy or conflict; straightforward tool evaluation paper."
    532     },
    533     "demo_ability": {
    534       "score": 0,
    535       "justification": "No code released, no demo, no way to try the tool."
    536     },
    537     "brand_recognition": {
    538       "score": 0,
    539       "justification": "From Beihang University; no well-known AI lab or product branding."
    540     }
    541   }
    542 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs