scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25262B)
      1 {
      2   "paper": {
      3     "title": "On the Evaluation of Large Language Models in Unit Test Generation",
      4     "authors": [
      5       "Lin Yang",
      6       "Chen Yang",
      7       "Shutao Gao",
      8       "Weijing Wang",
      9       "Bo Wang",
     10       "Qihao Zhu",
     11       "Xiao Chu",
     12       "Jianyi Zhou",
     13       "Guangtai Liang",
     14       "Qianxiang Wang",
     15       "Junjie Chen"
     16     ],
     17     "year": 2024,
     18     "venue": "ASE '24",
     19     "arxiv_id": "2406.18181",
     20     "doi": "10.1145/3691620.3695529"
     21   },
     22   "scan_version": 2,
     23   "active_modules": ["experimental_rigor", "data_leakage"],
     24   "methodology_tags": ["benchmark-eval"],
     25   "key_findings": "Prompt design (description style and code features) significantly affects LLM unit test generation effectiveness, with alignment to training data style being crucial. All studied LLMs including GPT-4 underperform traditional Evosuite in test coverage, primarily due to 34-62% syntactically invalid tests caused by hallucination. CoT and RAG methods adapted from other tasks do not consistently improve unit test generation. Defect detection ability is weak — on average 87% of defects have no valid generated tests, and only 47% of testable defects are detected.",
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper states 'All of our code and data are available at our project homepage' with a GitHub link (reference [5])."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "They use the publicly available Defects4J 2.0 benchmark and state all data is available at their homepage."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section 2.6 specifies PyTorch 2.0.0, transformers 4.34.1, VLLM, Ubuntu 18.04 LTS, Intel Xeon Gold 6240C CPU, 512GB RAM, eight NVIDIA A100 GPUs."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "While code and data are released, the paper itself does not contain step-by-step reproduction instructions. It references the homepage but no README or reproduction guide is described."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "Tables report point estimates (CSR, CovL, CovB percentages) with no confidence intervals or error bars."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Wilcoxon rank sum test with significance level 0.05 and Rank-biserial correlation effect size (>0.3 threshold) used throughout Sections 3.1-3.3."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Rank-biserial correlation scores reported as effect size measure, with >0.3 threshold for meaningful difference. Percentage differences between conditions are also reported."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "778 focal methods from 413 defects across 17 projects are used, but no justification for why this sample size is sufficient. No power analysis."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "Temperature is set to zero for determinism, and results are reported as single-run numbers. No variance across runs is reported."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Evosuite (traditional approach) and GPT-4 (commercial LLM) are included as baselines for comparison in Section 3.2."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "GPT-4 was state-of-the-art at time of study. Evosuite is the most widely-used traditional approach. Open-source LLMs selected from Hugging Face leaderboard."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Section 3.1 performs ablation on code features (removing one feature at a time from the prompt) across all five LLMs, with results in Tables 2-3."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Four metrics used: Compilation Success Rate (CSR), Line Coverage (CovL), Branch Coverage (CovB), and Number of Detected Defects (NDD)."
     96       },
     97       "human_evaluation": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Four authors with 4+ years Java experience manually analyzed and labeled undetected defects by reason (Section 3.4), with Cohen's Kappa of 0.95 reported."
    101       },
    102       "held_out_test_set": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "They use the Defects4J 2.0 benchmark which provides separate buggy and fixed versions. No tuning on the test data — prompts were designed via ablation on the same data, but the focal methods themselves are fixed by the benchmark."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Results broken down per LLM model, per prompt variant, per code feature, and per error type. Table 7 breaks down undetected defects by three reasons."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 3.2 analyzes three main types of compilation errors (unresolved symbols 30.68%, parameter mismatch 17.25%, abstract instantiation 10.38%). Section 3.4 categorizes reasons for undetected defects."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "RAG hurts all models (Table 5). CoT hurts CodeLlama models. LLMs underperform Evosuite. CL-13B performs worse than CL-7B. Multiple negative findings reported throughout."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Abstract claims about prompt influence, open-source vs GPT-4 comparison, and LLM limitations are all supported by results in Sections 3.1-3.4."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Causal claims from ablation studies (removing code features) use controlled single-variable manipulation. Claims about why description style matters are supported by analysis of training data composition."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The title says 'Unit Test Generation' broadly but results are only for Java (Defects4J). The paper does not explicitly bound findings to Java. Section 4 mentions extending to other benchmarks as future work but doesn't bound current claims."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4 discusses threats to validity including potential data leakage, prompt design not being globally optimal, and ICL method adaptations possibly not being the best. Analysis of underlying reasons for results is thorough throughout."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper measures specific metrics (CSR, line/branch coverage, NDD) and discusses them at the granularity of measurement without overframing. It acknowledges coverage is not the only dimension and discusses readability as a separate concern."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Specific model names with sizes: CodeLlama-7B-Instruct, CodeLlama-13B-Instruct, Phind-CodeLlama-34B-v2, DeepSeekCoder-6.7B-Instruct, DeepSeekCoder-33B-Instruct. GPT-4 is stated without specific version/snapshot."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "Prompts are described conceptually (natural language vs code language style, which code features included) but actual prompt text is not provided in the paper or appendix."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Temperature set to zero is stated (Section 4). Framework versions (PyTorch 2.0.0, transformers 4.34.1) specified in Section 2.6."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No agentic scaffolding is used. LLMs are prompted directly for test generation."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 2.5 describes focal method selection (public methods from patched methods, 778 total from 413 defects). Section 2.6 describes AST parsing for output extraction, test class integration, and dependency importing."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4 'Threats to Validity' provides substantive discussion of internal, external, and construct validity threats."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 4 discusses specific threats: ablation may not find the global optimum, potential data leakage with Defects4J, and CoT/RAG adaptations that may not be optimal. It also includes a specific analysis of data leakage that compares generated vs original test counts."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4 acknowledges the benchmark is limited to Java, the code feature combinations are not exhaustively explored, and the ICL methods may not be the best adaptations. Future work to extend to GitBug-Java is mentioned."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "All code and data available at project homepage (reference [5]). Defects4J is a public benchmark."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 2.5 describes using Defects4J 2.0 with 835 defects from 17 projects, selecting public patched methods as focal methods, yielding 778 methods from 413 defects."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants. Data comes from the standard Defects4J benchmark."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Section 2.6 documents the pipeline: LLM generates output → AST parser extracts tests → tests integrated into test class → dependencies imported → compilation → recursive error removal → coverage collection via JaCoCo."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Acknowledgments section lists National Natural Science Foundation of China grants and CCF-Huawei Populus Grove Fund."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Author affiliations clearly shown: four authors from Huawei Cloud Computing Co. Ltd., others from Tianjin University, Beijing Jiaotong University, Peking University. First author notes internship at Huawei."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Huawei funds the research (CCF-Huawei Populus Grove Fund) and has four co-authors. Huawei has commercial interest in LLM-based software tools, making the funder non-independent of the outcome."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests statement is included in the paper despite significant Huawei involvement."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No training data cutoff dates stated for any of the evaluated models."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Section 4 discusses potential data leakage: 'Following the existing practice, we compared LLM-generated unit tests with the original unit tests equipped by this benchmark' finding no exact match and different test counts (3.70 vs 2.41)."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "Defects4J was published in 2014 and is widely used. The paper acknowledges data leakage as a threat but does not address whether Defects4J code appeared in training data of the models (which were trained well after 2014)."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in the study."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in the study."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in the study."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in the study."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in the study."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in the study."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in the study."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No per-example inference cost or latency reported. Only total GPU hours (3,000 A100 hours) mentioned."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 1 states 'Our experiments required around 3,000 NVIDIA A100 GPU-hours.' Hardware configuration described in Section 2.6."
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "Temperature set to zero for determinism. No multi-seed analysis. Single configuration per experiment."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": true,
    313         "justification": "A single run is implied: Section 4 states that temperature is set to 0 for determinism."
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "No hyperparameter search budget reported. Temperature fixed at zero, but no discussion of whether other settings were explored."
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": true,
    323         "justification": "Best prompt configuration selected via systematic ablation study (Section 3.1). The paper acknowledges this finds a local not global optimum in Section 4."
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "Many Wilcoxon tests performed across models, prompt variants, and metrics without any multiple comparison correction mentioned."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "Authors compare their prompt designs and adaptations of CoT/RAG without acknowledging potential bias in their implementations. No independent evaluation."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No performance-vs-compute analysis. Evosuite likely uses far less compute than LLMs but this is not discussed."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "Defects4J is used without discussing whether it adequately represents real-world unit test generation scenarios. No construct validity analysis."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "No scaffolding used. LLMs are prompted directly."
    349       }
    350     },
    351     "data_leakage": {
    352       "temporal_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "Defects4J was published in 2014; all models trained after. No discussion of temporal leakage from models potentially seeing Defects4J solutions during training."
    356       },
    357       "feature_leakage_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether the evaluation setup leaks information. The prompt includes various code features from the focal class, which is a design choice rather than leakage, but the paper provides no analysis of this."
    361       },
    362       "non_independence_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No discussion of whether Defects4J projects or methods share structural similarities with LLM training data."
    366       },
    367       "leakage_detection_method": {
    368         "applies": true,
    369         "answer": true,
    370         "justification": "Section 4 compares LLM-generated tests with original benchmark tests, finding no exact matches and different counts (3.70 vs 2.41 average), as a basic leakage detection method."
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "Prompt description style significantly affects LLM effectiveness, with alignment to training data style being key.",
    377       "evidence": "Table 1 shows CL-7B and CL-13B perform significantly better with NL style (aligned with Llama2 training), while DeepSeek-Coder models are robust to style choice. Wilcoxon tests confirm significance.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Including other methods from the focal class (FCm) improves syntactic validity but reduces test coverage due to context window consumption.",
    382       "evidence": "Tables 2-3 show that removing FCm decreases CSR by 2-15% but increases coverage. The average number of generated tests increases from 3,654 to 5,434 when FCm is removed.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "All LLMs including GPT-4 underperform Evosuite in test coverage.",
    387       "evidence": "Table 4: Evosuite achieves 78.91% line coverage vs GPT-4's 40.43%. Evosuite CSR is 85.71% vs GPT-4's 52.96%.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "CoT helps DeepSeek-Coder but hurts CodeLlama models for unit test generation.",
    392       "evidence": "Table 5 shows CoT gives +2.72% CovL for DC-7B but -3.04% for CL-7B and -6.45% for CL-13B, attributed to code comprehension ability differences.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "RAG adapted from code generation is ineffective for unit test generation.",
    397       "evidence": "Table 5 shows RAG decreases coverage for all five models (up to -9.28% CovL for PD-34B). Gap between retrieved tests (12.10 LOC avg) and generated tests (5.60 LOC) identified as cause.",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "87.13% of defects cannot be detected due to compilation issues.",
    402       "evidence": "Table 6 shows NTD ranges from 28 to 65 out of 413 defects, meaning the vast majority have no valid tests. Only 47.28% of testable defects are actually detected.",
    403       "supported": "strong"
    404     },
    405     {
    406       "claim": "Missing specific defect-triggering inputs is the main reason for undetected defects.",
    407       "evidence": "Table 7 shows 74.99% of undetected defects (among those with valid tests) are due to missing specific inputs. Manual analysis by 4 authors with Cohen's Kappa 0.95.",
    408       "supported": "strong"
    409     }
    410   ],
    411   "red_flags": [
    412     {
    413       "flag": "Huawei conflict of interest",
    414       "detail": "Four co-authors from Huawei Cloud Computing, CCF-Huawei funding, and first author interned at Huawei. No competing interests statement. While the paper evaluates open-source LLMs rather than Huawei products specifically, the industrial involvement is substantial and undisclosed as a conflict."
    415     },
    416     {
    417       "flag": "No prompts provided",
    418       "detail": "Despite prompt design being a central research question, actual prompt texts are not provided in the paper. Only conceptual descriptions of NL vs CL styles and code features are given, limiting reproducibility of the core contribution."
    419     },
    420     {
    421       "flag": "Contamination risk unaddressed",
    422       "detail": "Defects4J (2014) was published well before any of the evaluated models were trained. While the paper compares generated vs original tests as a basic check, it does not address whether the models saw Defects4J source code or solutions during training."
    423     },
    424     {
    425       "flag": "GPT-4 version unspecified",
    426       "detail": "GPT-4 is used as a reference baseline but no specific version or API snapshot date is provided, making results unreproducible as GPT-4 behavior changes over time."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Evaluating Large Language Models Trained on Code",
    432       "authors": ["Mark Chen"],
    433       "year": 2021,
    434       "arxiv_id": "2107.03374",
    435       "relevance": "Introduces Codex/HumanEval, foundational work on LLM code generation evaluation."
    436     },
    437     {
    438       "title": "No More Manual Tests? Evaluating and Improving ChatGPT for Unit Test Generation",
    439       "authors": ["Zhiqiang Yuan"],
    440       "year": 2023,
    441       "arxiv_id": "2305.04207",
    442       "relevance": "Proposes ChatTester for LLM-based unit test generation with ChatGPT; direct prior work."
    443     },
    444     {
    445       "title": "ChatUniTest: a ChatGPT-based automated unit test generation tool",
    446       "authors": ["Zhuokui Xie"],
    447       "year": 2023,
    448       "arxiv_id": "2305.04764",
    449       "relevance": "ChatGPT-based unit test generation tool with self-repair; key baseline approach."
    450     },
    451     {
    452       "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation",
    453       "authors": ["Max Schäfer"],
    454       "year": 2024,
    455       "relevance": "TestPilot: empirical evaluation of GPT-3.5 for JavaScript unit test generation."
    456     },
    457     {
    458       "title": "CODAMOSA: Escaping coverage plateaus in test generation with pre-trained large language models",
    459       "authors": ["Caroline Lemieux"],
    460       "year": 2023,
    461       "relevance": "Combines evolutionary search with LLM code generation for test generation."
    462     },
    463     {
    464       "title": "Exploring the Effectiveness of Large Language Models in Generating Unit Tests",
    465       "authors": ["Mohammed Latif Siddiq"],
    466       "year": 2023,
    467       "arxiv_id": "2305.00418",
    468       "relevance": "Empirical study on GPT-3.5 and Codex for unit test generation."
    469     },
    470     {
    471       "title": "Code Llama: Open Foundation Models for Code",
    472       "authors": ["Baptiste Rozière"],
    473       "year": 2023,
    474       "arxiv_id": "2308.12950",
    475       "relevance": "Foundation model used in the study; major open-source code LLM family."
    476     },
    477     {
    478       "title": "Copiloting the Copilots: Fusing Large Language Models with Completion Engines for Automated Program Repair",
    479       "authors": ["Yuxiang Wei"],
    480       "year": 2023,
    481       "relevance": "LLM-based automated program repair approach, addresses hallucination in code generation."
    482     },
    483     {
    484       "title": "Large Language Models for Software Engineering: Survey and Open Problems",
    485       "authors": ["Angela Fan"],
    486       "year": 2023,
    487       "relevance": "Comprehensive survey of LLMs for SE tasks including testing."
    488     },
    489     {
    490       "title": "Software Testing With Large Language Models: Survey, Landscape, and Vision",
    491       "authors": ["Junjie Wang"],
    492       "year": 2024,
    493       "relevance": "Survey of LLM applications in software testing."
    494     },
    495     {
    496       "title": "Effective test generation using pre-trained Large Language Models and mutation testing",
    497       "authors": ["Arghavan Moradi Dakhel"],
    498       "year": 2024,
    499       "relevance": "LLM test generation combined with mutation testing for effectiveness."
    500     },
    501     {
    502       "title": "Exploring and Evaluating Hallucinations in LLM-Powered Code Generation",
    503       "authors": ["Fang Liu"],
    504       "year": 2024,
    505       "relevance": "Studies hallucination in LLM code generation, directly relevant to compilation errors found in this paper."
    506     }
    507   ]
    508 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs