scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28708B)
      1 {
      2   "paper": {
      3     "title": "CITYWALK: Enhancing LLM-Based C++ Unit Test Generation via Project-Dependency Awareness and Language-Specific Knowledge",
      4     "authors": [
      5       "Yuwei Zhang",
      6       "Qingyuan Lu",
      7       "Kai Liu",
      8       "Wensheng Dou",
      9       "Jiaxin Zhu",
     10       "Li Qian",
     11       "Chunxi Zhang",
     12       "Zheng Lin",
     13       "Jun Wei"
     14     ],
     15     "year": 2025,
     16     "venue": "ACM Transactions on Software Engineering and Methodology",
     17     "arxiv_id": "2501.16155",
     18     "doi": "10.1145/3763791"
     19   },
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper states 'We publicly release the artifacts [53] of CITYWALK on Zenodo to facilitate the reproduction' (Section 1). Reference [53] points to https://zenodo.org/records/14022506."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The benchmark of 1288 focal methods across ten C++ projects is based on publicly available GitHub repositories whose URLs are provided (footnotes 4-13). The Zenodo artifact package [53] is described as a 'Replicate Package' which facilitates reproduction."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions using Clang for parsing and llvm-cov for coverage, and Python for implementation, but does not provide a requirements.txt, Dockerfile, or detailed environment setup with library versions. No specific Python version, library versions, or dependency specifications are given."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "While a Zenodo artifact is released, the paper itself does not contain step-by-step reproduction instructions. The methodology is described in detail but there is no 'Reproducing Results' section or explicit commands to run."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results are reported as point estimates (e.g., '83.39% CSR', '73.35% EPR') without any confidence intervals, error bars, or uncertainty quantification."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper claims CITYWALK 'outperforms' all baselines and 'surpasses the best baseline, GPT-4o, by 51.55% in CSR' but provides no statistical significance tests (no p-values, t-tests, or similar). Differences are compared by raw numbers only."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper reports effect sizes in terms of absolute percentage improvements with baseline context. For example, 'CITYWALK surpasses the best baseline, GPT-4o, by 51.55% in CSR, 43.76% in EPR, 24.78% in CovL, and 21.55% in CovB' (Section 4.1.1). Ablation results show per-component degradation values (Tables 14-15)."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The benchmark comprises 1288 focal methods from 10 projects, but no justification is given for why this sample size is sufficient for the claims made. No power analysis or sample size rationale is discussed."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "All experiments use greedy decoding (temperature=0) producing deterministic outputs, and results are reported from single runs. No variance, standard deviation, or spread across multiple runs is reported."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Seven baselines are compared: two open-source code LLMs (CodeGeeX4, DeepSeek-V3), two closed-source LLMs (GPT-3.5, GPT-4o), and three LLM-based unit test generation approaches (ChatTester, HITS, TestPilot). Results in Tables 4-7."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Baselines include DeepSeek-V3 (2024), GPT-4o (2024), CodeGeeX4, and recent approaches like HITS (ASE 2024), ChatTester (FSE 2024), and TestPilot (TSE 2024). The paper notes that Coyote C++ and CITRUS are unavailable for reproduction (Section 3.3)."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Section 4.2 presents comprehensive ablation studies removing one component at a time (Depc, Depd, Contextintent, PROMPTstep, GuidelineDK, Fixrule+prompt). Results shown in Tables 14-15. Additionally, phase-by-phase error-fixing ablation in Tables 16-17."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Four evaluation metrics are used: Compilation Success Rate (CSR), Execution Pass Rate (EPR), Line Coverage (CovL), and Branch Coverage (CovB). Additionally, mutation scores and human evaluation scores are reported."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Section 5.3 describes a human evaluation with five participants (each with >3 years C++ experience) who scored 150 test cases across four aspects: Naming Intuitiveness, Code Layout, Assertion Quality, and Adoption Efforts. Results in Table 20."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Four of the ten projects (json.cpp, glomap, papy, mlx) were created after GPT-4o's training cutoff (October 2023), serving as a held-out test set to address data leakage concerns (Section 3.2, Table 3 'Trained?' column)."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down per-project across all ten C++ projects in Tables 4-7, Tables 10-13, and Tables 14-15. Error categories are broken down in Figure 8 and Table 1."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 4.1.2 provides a detailed 'Bad Case Breakdown' analyzing 1567 errors across 1021 failed test cases. Error categories include Access Error, Undefined Symbols, Syntax Error, etc. (Figure 8). Section 5.4 discusses false-positive executable test cases."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports that CITYWALK performs poorly on leveldb due to heavy external interactions (disk I/O, synchronization locks) in Section 4.1.2. It also acknowledges that 36.66% of generated test cases still fail (Table 8). Section 5.4 discusses the false positive problem."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims CITYWALK 'outperforms current state-of-the-art approaches on a collection of ten popular C++ projects.' Tables 4-7 confirm CITYWALK achieves the best scores on all four metrics when averaged over the ten projects."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper makes causal claims through ablation studies (e.g., 'removing Depc leads to 29.49% drop in CovL'). The ablation design follows controlled single-variable manipulation: removing one component at a time while keeping others fixed (Section 4.2). This is adequate for causal claims about component contributions."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper explicitly scopes to C++ unit test generation and acknowledges limitations: 'Future work will involve expanding the benchmark and integrating additional LLMs to better assess the generalizability' (Section 5.5). The title and claims are specific to C++."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Section 5.5 (Threats to Validity) discusses alternative explanations including data leakage ('GPT-4o is a closed-source model, the exact composition of its training data are not publicly disclosed'), prompt sensitivity effects, and the possibility that configuration dependency information alone could improve baselines (Section 4.1.1, finding 2)."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper says 'we employ the gpt-4o version from the GPT family' and 'gpt-3.5-turbo-0125' (Section 4.1). For GPT-4o, no specific snapshot date or API version is given — just 'gpt-4o'. DeepSeek-V3 and CodeGeeX4 are named without version identifiers. Marketing names without snapshot dates do not count."
    144       },
    145       "prompts_provided": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Figure 5 provides the detailed prompting content for generating initial test cases, including the full prompt template with task definition, step-by-step instructions, and output format. Figure 6 shows the error-fixing prompt. Table 2 lists the domain knowledge guidelines. While some fields are placeholders (e.g., {method_name}), the actual structure and instructions are complete enough to reconstruct prompts."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 3.5 states: 'maximum output limit of 4096 tokens', 'greedy decoding', 'top-1 chat completion choice', 'sampling temperature to 0', and 'zero-shot setting'. Maximum prompt length is 14733 tokens (footnote 14)."
    154       },
    155       "scaffolding_described": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "The agentic scaffolding is described in detail across Sections 2.1-2.6: five-stage pipeline with project pre-processing, dependency extraction, intention context retrieval via RAG, empirical observation-based knowledge, and three-step generation with three-phase post-processing. Algorithm 1 provides pseudocode. Figures 2, 5, and 6 show workflow and prompting structure."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section 2.2 describes preprocessing: AST parsing with Clang, structured focal context extraction, knowledge base construction with text segmentation and BGE embedding. Section 3.2 documents project selection criteria (>50 GitHub stars, actively maintained, spanning application domains, containing complex methods). The filtering from repository to focal methods is documented."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 5.5 'Threats to Validity' provides a dedicated subsection discussing external and internal threats."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 5.5 discusses specific threats: the diversity of only ten C++ projects, GPT-4o's closed training data creating data leakage risk, LLM sensitivity to prompt configuration, single-round iterative fixes versus multi-round in original baselines, and the limitation to specific LLMs. These are specific to this study rather than generic disclaimers."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The paper explicitly states scope boundaries: it focuses on C++ (not other languages), uses ten projects with specific selection criteria, excludes larger projects 'to effectively manage token costs within our limited budget' (Section 3.2), and notes that Coyote C++ and CITRUS are unavailable for comparison. Section 5.5 explicitly notes the need to expand to more LLMs and projects."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The Zenodo replicate package [53] is released. The benchmark projects are all publicly available on GitHub with URLs provided. The raw experimental data (test cases, error messages) can be regenerated from the released artifacts."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Section 3.2 describes the benchmark collection: projects sourced from GitHub, four from prior C++ testing studies, two practical projects (ninja, leveldb), four post-cutoff projects. Selection criteria include >50 stars, active maintenance, diverse application domains, and complex methods (cyclomatic complexity >10)."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "For the human evaluation (Section 5.3), five participants 'each with over three years of C++ development experience' are mentioned, but recruitment methods are not described — how they were found, whether they are students/colleagues, or potential selection bias."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The pipeline is documented across Sections 2-3: project cloning, AST parsing, focal method extraction (1288 methods), knowledge base construction, dependency analysis, test generation, compilation/execution, coverage measurement with llvm-cov. The flow from input to output is traceable."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The Acknowledgments section discloses funding: 'National Key R&D Program of China (Grant No. 2024YFB4505902), the Major Project of ISCAS (Grant No. ISCAS-ZD-202302), the Basic Research Project of ISCAS (Grant No. ISCAS-JCZD-202403), the Youth Innovation Promotion Association of the Chinese Academy of Sciences (Grant Nos. Y2022044 and 2023121).'"
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Author affiliations are clearly listed: Institute of Software, Chinese Academy of Sciences; University of Chinese Academy of Sciences; and Shanghai Stock Exchange Technology Co., Ltd. Some authors are affiliated with the Shanghai Stock Exchange, but the paper evaluates open-source tools rather than a proprietary product."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "The funders are Chinese government research programs (National Key R&D Program, CAS) and institutional grants. These are general research funding sources with no apparent financial stake in whether CITYWALK outperforms specific baselines."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No competing interests or financial interests statement is present in the paper. Some authors are from Shanghai Stock Exchange Technology Co., which could have commercial interest in automated testing tools. Absence of disclosure is noted."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "The paper states GPT-4o's 'knowledge cutoff date (October 2023)' in Section 3.2 when justifying the inclusion of four post-cutoff projects."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "Section 3.2 explicitly addresses this: 'To mitigate potential data leakage concerns, we include four projects — json.cpp, glomap, papy, and mlx — all created after the GPT-4o knowledge cutoff date (October 2023). This ensures that GPT-4o was not trained on these projects.' Table 3 includes a 'Trained?' column."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "The paper directly addresses benchmark contamination by including post-cutoff projects and comparing results between potentially contaminated (6 projects, 'Y') and clean (4 projects, 'N') subsets. The mutation testing analysis in Section 5.1 shows json.cpp (post-cutoff) achieves higher mutation scores than tinyxml2 (pre-cutoff), providing 'evidence that CITYWALK's ability to generate high-quality test cases is primarily driven by the designed components as guidance, rather than relying on GPT-4o's memorization.'"
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "The human evaluation study (Section 5.3) is not pre-registered. No link to any pre-registration platform is provided."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "The human evaluation involved five participants scoring test cases (Section 5.3), but no IRB or ethics board approval is mentioned."
    254       },
    255       "demographics_reported": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "Only minimal characterization is provided: 'five participants, each with over three years of C++ development experience.' No further demographics (roles, education level, industry/academia, gender, geographic distribution) are reported."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": true,
    262         "answer": false,
    263         "justification": "The only stated criterion is '>3 years of C++ development experience.' No formal inclusion/exclusion criteria, screening process, or selection rationale is described."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "This is not an experimental study comparing human groups. All participants evaluated the same set of test cases. Randomization of condition assignment is not applicable."
    269       },
    270       "blinding_described": {
    271         "applies": true,
    272         "answer": true,
    273         "justification": "Section 5.3 states: 'To avoid bias, participants are not informed about the source of each test case (i.e., whether it is LLM-generated or human-written).'"
    274       },
    275       "attrition_reported": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No information is provided about whether all five participants completed all evaluations or if any dropped out."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": true,
    284         "answer": true,
    285         "justification": "Table 19 reports average execution time per focal method (34.59s for CITYWALK) and average token usage (5726 tokens). Section 5.2 notes 'the incremental cost per method is approximately $0.03 higher than the best baseline TestPilot.' Section 3.2 mentions 'token costs within our limited budget.'"
    286       },
    287       "compute_budget_stated": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "While per-method costs are reported, the total computational budget (total API spend, total GPU hours, or total cost for all 1288 methods) is not stated. The paper mentions a 'limited budget' but does not quantify it."
    291       }
    292     }
    293   },
    294   "claims": [
    295     {
    296       "claim": "CITYWALK outperforms seven state-of-the-art baselines across all four evaluation metrics (CSR, EPR, CovL, CovB) on C++ unit test generation.",
    297       "evidence": "Tables 4-7 show CITYWALK achieves 83.39% CSR (vs. 31.84% best baseline GPT-4o), 73.35% EPR (vs. 29.59%), 44.46% CovL (vs. 19.68%), and 37.70% CovB (vs. 16.15%).",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "CITYWALK generates fewer failed test cases than baselines, with only 36.66% failing.",
    302       "evidence": "Table 8 reports 1021 failed out of 2785 total test cases (36.66%), compared to 81.16% for GPT-4o and 89.01% for DeepSeek-V3.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Configuration dependencies (Depc) are the most impactful component, with removal causing 29.49% drop in CovL and 24.58% drop in CovB.",
    307       "evidence": "Tables 14-15 show ablation results where w/o Depc has the largest average degradation across all components.",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "CITYWALK integrates as a plug-in and consistently improves three additional LLMs (CodeGeeX4, DeepSeek-V3, GPT-3.5).",
    312       "evidence": "Tables 10-13 show consistent improvements when CITYWALK is applied to each LLM. Performance gains scale with model parameter count.",
    313       "supported": "strong"
    314     },
    315     {
    316       "claim": "CITYWALK-generated test cases achieve mutation scores of 81.12% (tinyxml2) and 89.34% (json.cpp), demonstrating bug-detection capability.",
    317       "evidence": "Table 18 reports mutation testing results. Section 5.1 notes these exceed EvoSuite's 34.1% and universalmutator's 35% on their respective benchmarks.",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "CITYWALK-generated test cases are preferred by developers over LLM baselines in readability and usability.",
    322       "evidence": "Table 20 shows five participants rated CITYWALK higher across Naming Intuitiveness (2.87), Code Layout (2.70), Assertion Quality (2.46), and Adoption Efforts (2.70) compared to all four LLM baselines.",
    323       "supported": "weak"
    324     },
    325     {
    326       "claim": "CITYWALK's performance is driven by designed components rather than GPT-4o's memorization of training data.",
    327       "evidence": "Four post-cutoff projects were included; json.cpp (post-cutoff) achieved higher mutation score (89.34%) than tinyxml2 (pre-cutoff, 81.12%). Section 5.1.",
    328       "supported": "moderate"
    329     }
    330   ],
    331   "methodology_tags": [
    332     "benchmark-eval"
    333   ],
    334   "key_findings": "CITYWALK, an LLM-based framework for C++ unit test generation, substantially outperforms seven baselines including GPT-4o, achieving 83.39% compilation success rate (vs. 31.84% for GPT-4o) and 44.46% line coverage (vs. 19.68%) across 1288 focal methods from ten C++ projects. The framework's key contributions are project-level dependency extraction (configuration and cross-file data dependencies), RAG-based intention context retrieval, and language-specific domain knowledge derived from empirical error analysis. Ablation studies show configuration dependencies are the most impactful component, and the approach generalizes across multiple LLMs as a plug-in enhancement.",
    335   "red_flags": [
    336     {
    337       "flag": "No statistical significance tests",
    338       "detail": "All comparative claims ('outperforms', 'surpasses') are based on raw metric comparisons without any statistical tests. With single-run deterministic experiments (temperature=0), there is no way to assess whether observed differences are robust to variations in model behavior or sampling."
    339     },
    340     {
    341       "flag": "Very small human evaluation sample",
    342       "detail": "The human evaluation uses only 5 participants evaluating 25 focal methods. No inter-rater reliability measures are reported, no recruitment details are given, and no statistical analysis of the human ratings is performed. This sample is too small to make generalizable claims about developer preference."
    343     },
    344     {
    345       "flag": "Mutation score comparison across different benchmarks",
    346       "detail": "Section 5.1 compares CITYWALK's mutation scores (81-89%) with EvoSuite's score (34.1%) from the SBST 2022 competition and universalmutator's average (35%). These are on entirely different benchmarks, making the comparison misleading."
    347     },
    348     {
    349       "flag": "Single-run deterministic experiments",
    350       "detail": "All experiments use temperature=0 with a single run, so there is no variance reporting. While deterministic, different model API versions could produce different results, and the robustness of results to prompt variations is unknown."
    351     },
    352     {
    353       "flag": "Baselines limited to single-round fixes for fairness",
    354       "detail": "Section 4.1.1 notes that baselines (ChatTester, HITS) perform only a single round of iterative fixes 'similar to CITYWALK', which is 'much fewer than the number of iterations used in their original papers.' This may understate baseline performance relative to their published results."
    355     }
    356   ],
    357   "cited_papers": [
    358     {
    359       "title": "ChatUniTest: A Framework for LLM-Based Test Generation",
    360       "authors": ["Yinghao Chen", "Zehao Hu", "Chen Zhi", "Junxiao Han", "Shuiguang Deng", "Jianwei Yin"],
    361       "year": 2024,
    362       "doi": "10.1145/3663529.3663801",
    363       "relevance": "LLM-based unit test generation framework using generate-validate-fix mechanism; closely related prior approach, though not one of the seven baselines evaluated in the paper."
    364     },
    365     {
    366       "title": "Evaluating and Improving ChatGPT for Unit Test Generation",
    367       "authors": ["Zhiqiang Yuan", "Mingwei Liu", "Shiji Ding", "Kaixin Wang", "Yixuan Chen", "Xin Peng", "Yiling Lou"],
    368       "year": 2024,
    369       "doi": "10.1145/3660783",
    370       "relevance": "ChatTester: evaluation of ChatGPT for Java unit test generation with intent comprehension, direct comparison baseline."
    371     },
    372     {
    373       "title": "HITS: High-Coverage LLM-Based Unit Test Generation via Method Slicing",
    374       "authors": ["Zejun Wang", "Kaibo Liu", "Ge Li", "Zhi Jin"],
    375       "year": 2024,
    376       "doi": "10.1145/3691620.3695501",
    377       "relevance": "LLM-based high-coverage test generation through program decomposition, direct comparison baseline."
    378     },
    379     {
    380       "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation",
    381       "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"],
    382       "year": 2024,
    383       "doi": "10.1109/TSE.2023.3334955",
    384       "relevance": "TestPilot: empirical evaluation of LLMs for automated unit test generation, direct comparison baseline."
    385     },
    386     {
    387       "title": "Code-Aware Prompting: A Study of Coverage-Guided Test Generation in Regression Setting using LLM",
    388       "authors": ["Gabriel Ryan", "Siddhartha Jain", "Mingyue Shang", "Shiqi Wang", "Xiaofei Ma", "Murali Krishna Ramanathan", "Baishakhi Ray"],
    389       "year": 2024,
    390       "doi": "10.1145/3643769",
    391       "relevance": "SymPrompt: uses symbolic execution-based path information in prompts for coverage-guided test generation."
    392     },
    393     {
    394       "title": "On the Evaluation of Large Language Models in Unit Test Generation",
    395       "authors": ["Lin Yang", "Chen Yang", "Shutao Gao"],
    396       "year": 2024,
    397       "doi": "10.1145/3691620.3695529",
    398       "relevance": "Evaluation framework for LLMs in unit test generation, establishing metrics used in this work."
    399     },
    400     {
    401       "title": "Automated Unit Test Improvement using Large Language Models at Meta",
    402       "authors": ["Nadia Alshahwan", "Jubin Chheda", "Anastasia Finogenova"],
    403       "year": 2024,
    404       "doi": "10.1145/3663529.3663839",
    405       "relevance": "Industrial application of LLMs for unit test improvement at Meta, relevant to practical LLM-based testing."
    406     },
    407     {
    408       "title": "ChatGPT vs SBST: A Comparative Assessment of Unit Test Suite Generation",
    409       "authors": ["Yutian Tang", "Zhijie Liu", "Zhichao Zhou", "Xiapu Luo"],
    410       "year": 2024,
    411       "doi": "10.1109/TSE.2024.3382365",
    412       "relevance": "Compares ChatGPT with search-based software testing for unit test generation."
    413     },
    414     {
    415       "title": "Software Testing With Large Language Models: Survey, Landscape, and Vision",
    416       "authors": ["Junjie Wang", "Yuchao Huang", "Chunyang Chen", "Zhe Liu", "Song Wang", "Qing Wang"],
    417       "year": 2024,
    418       "doi": "10.1109/TSE.2024.3368208",
    419       "relevance": "Comprehensive survey of LLM-based software testing covering the landscape and research directions."
    420     },
    421     {
    422       "title": "DeepSeek-V3 Technical Report",
    423       "authors": ["DeepSeek-AI"],
    424       "year": 2024,
    425       "arxiv_id": "2412.19437",
    426       "relevance": "Technical report for DeepSeek-V3, one of the LLMs evaluated as both baseline and CITYWALK integration target."
    427     },
    428     {
    429       "title": "LLM Hallucinations in Practical Code Generation: Phenomena, Mechanism, and Mitigation",
    430       "authors": ["Ziyao Zhang", "Yanli Wang", "Chong Wang", "Jiachi Chen", "Zibin Zheng"],
    431       "year": 2024,
    432       "arxiv_id": "2409.20550",
    433       "relevance": "Studies LLM hallucination in code generation, directly relevant to understanding errors in LLM-generated test code."
    434     }
    435   ]
    436 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs