scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30296B)
      1 {
      2   "paper": {
      3     "title": "On the Evaluation of Large Language Models in Unit Test Generation",
      4     "authors": [
      5       "Lin Yang",
      6       "Chen Yang",
      7       "Shutao Gao",
      8       "Weijing Wang",
      9       "Bo Wang",
     10       "Qihao Zhu",
     11       "Xiao Chu",
     12       "Jianyi Zhou",
     13       "Guangtai Liang",
     14       "Qianxiang Wang",
     15       "Junjie Chen"
     16     ],
     17     "year": 2024,
     18     "venue": "ASE '24",
     19     "arxiv_id": "2406.18181",
     20     "doi": "10.1145/3691620.3695529"
     21   },
     22   "scan_version": 3,
     23   "active_modules": ["experimental_rigor", "data_leakage"],
     24   "methodology_tags": ["benchmark-eval"],
     25   "key_findings": "Open-source LLMs (CodeLlama, DeepSeek-Coder variants from 7B to 34B) all underperform traditional EvoSuite in unit test coverage, primarily because 34-62% of their generated tests are syntactically invalid due to LLM hallucination. Prompt design significantly affects effectiveness: aligning description style with training data and managing code feature inclusion to balance context vs. generation space matters more than model scale. Chain-of-Thought helps only models with strong code comprehension (DeepSeek-Coder), while RAG adapted from code generation consistently hurts all models. Defect detection is weak: on average 87% of defects lack any valid generated tests, and among testable defects only about 47% are detected; roughly 75% of the undetected defects are missed because the tests lack specific defect-triggering inputs.",
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Section 2.6 states 'All of our code and data are available at our project homepage [5]' linking to https://github.com/LeonYang95/LLM4UT."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "They use the publicly available Defects4J 2.0 benchmark and claim all data is available at their project homepage [5]."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section 2.6 specifies 'Ubuntu 18.04 LTS, Intel Xeon Gold 6240C CPU, 512GB RAM, and eight NVIDIA A100 GPUs' with 'PyTorch 2.0.0 and transformers 4.34.1, and used VLLM libraries.'"
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Code and data released at project homepage [5] on GitHub. The paper describes the experimental pipeline in sufficient detail (Section 2.6) and the repository presumably contains runnable scripts."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "Main results (Tables 1-6) report only point estimates (CSR, CovL, CovB percentages) with no confidence intervals or error bars."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper uses Wilcoxon rank sum tests with significance level 0.05 throughout (Sections 3.1-3.3), reporting p-values to assess statistical significance of differences."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 3.1 reports 'Rank-biserial correlation scores to show the effect size' with a threshold of >0.3 for meaningful differences, following Cohen (2013)."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "778 focal methods from 413 defects across 17 projects are used. Section 4 states 'our experimental design balances conclusion generalizability and evaluation costs well' but no power analysis or formal sample size justification is provided."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "Temperature is set to 0 for determinism, producing single-run results. No variance, standard deviation, or spread measures are reported across experimental runs."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The paper compares against EvoSuite (traditional technique) and GPT-4 (commercial LLM) as reference baselines in Table 4."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "GPT-4 was the state-of-the-art commercial LLM at the time of study. EvoSuite is the widely-used standard baseline for automated test generation. Both are appropriate."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "RQ1 performs extensive ablation on prompt design, removing one code feature at a time (Tables 2-3) to measure each feature's contribution. RQ3 tests adding CoT and RAG."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Four metrics are used: Compilation Success Rate (CSR), Line Coverage (CovL), Branch Coverage (CovB), and Number of Detected Defects (NDD)."
     96       },
     97       "human_evaluation": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 3.4: four authors with 4+ years Java experience manually analyzed and labeled undetected defects into three categories (Cohen's Kappa = 0.95). This constitutes human evaluation of the system's outputs."
    101       },
    102       "held_out_test_set": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "The best prompt design is selected in RQ1 using the same Defects4J data, then used for RQ2-4 evaluation on the same data. No separate held-out set for configuration selection vs. final evaluation."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Results are broken down per model (Tables 1-6), per prompt variant (Tables 2-3), per error type (Section 3.2), and per defect failure reason (Table 7). Per-project breakdowns are at the homepage."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 3.2 analyzes three main compilation error types (unresolved symbols 30.68%, parameter mismatch 17.25%, abstract instantiation 10.38%). Section 3.4 analyzes why defects are undetected (Table 7)."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Multiple negative results: RAG hurts all models (Table 5), CoT hurts CodeLlama models, FCm inclusion hurts coverage despite helping validity, all LLMs underperform EvoSuite."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Abstract claims about prompt design influence (Tables 1-3), open-source vs GPT-4 vs EvoSuite performance (Table 4), ICL limitations (Table 5), and defect detection weaknesses (Tables 6-7) are all supported by corresponding results sections."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Causal claims (e.g., 'removing FCm improves coverage', 'CoT improves DeepSeek-Coder') are based on controlled ablation experiments with single-variable manipulation and statistical testing."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The paper consistently references '17 Java projects from Defects4J', 'five open-source LLMs', and acknowledges in Section 4 that results are limited to their specific benchmark and model selections. Findings are framed as specific to the tested setting."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper discusses alternative explanations: training data alignment for style sensitivity (Section 3.1), code comprehension ability differences for CoT results (Section 3.3), token space trade-offs for coverage effects (Section 3.1), and repetition issues for CL-7B anomaly (Section 3.2)."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Metrics (CSR, line/branch coverage, defect detection count) directly measure what is claimed. No proxy gap exists — the paper measures test validity and coverage and reports them as such."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Open-source models are precisely named (CodeLlama-7B-Instruct, Phind-CodeLlama-34B-v2, DeepSeekCoder-6.7B-Instruct, etc.) but GPT-4 is referred to only as 'GPT-4' without a snapshot date or API version."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 2.3 describes prompt design in detail (NL vs CL styles, code features), and the paper links to a GitHub repository [5] containing all code, which includes the prompt implementations."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "Only temperature=0 is reported (Section 4). Other key settings like max tokens, top-p, and generation length limits are not specified, despite the paper's own argument that token space affects output quality."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No agentic scaffolding is used. The approach is single-pass prompt→response with post-processing (AST extraction and compilation)."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 2.5-2.6 documents: selection of public patched methods as focal methods (778 from 413 defects), AST-based test extraction using tree-sitter, integration into test classes, and import resolution."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4 'Threats to Validity' provides substantive discussion across internal, external, and construct validity dimensions."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 4 discusses specific threats: code review and testing of experimental scripts, model selection criteria based on HuggingFace leaderboard, inability to evaluate all code feature combinations, specific data leakage analysis comparing generated vs original tests, and plans to extend to GitBug-Java."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4 states specific boundaries: 'we may not find the globally optimal setting for each studied LLM', acknowledges the benchmark is limited to Defects4J Java projects, and discusses plans to extend to more recent benchmarks like GitBug-Java."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 2.6: 'All of our code and data are available at our project homepage [5]' on GitHub, enabling access to raw experimental outputs."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 2.5 describes using Defects4J 2.0, selecting patched public methods as focal methods, resulting in 778 focal methods from 413 defects across 17 projects."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants. Data source is the standard Defects4J benchmark."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The pipeline is documented: focal method selection → prompt construction → LLM generation → AST-based test extraction → test class integration → import resolution → compilation → execution → coverage measurement via JaCoCo."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Acknowledgments section lists NSFC grants (62322208, 62202040, 62232001, 12411530122) and CCF-Huawei Populus Grove Fund."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Author affiliations are listed. Four authors (Xiao Chu, Jianyi Zhou, Guangtai Liang, Qianxiang Wang) are from Huawei Cloud Computing Co. Ltd., and the first author completed this work during a Huawei internship."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The CCF-Huawei Populus Grove Fund is partially funded by Huawei, and four co-authors are Huawei employees. Huawei has commercial interest in LLM-based software engineering tools, making the funder non-independent."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests or financial interests statement is included in the paper."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No training data cutoff dates are stated for any of the models used. The paper acknowledges 'a potential data leakage threat' but does not specify when models' training data was collected."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Section 4: 'we compared LLM-generated unit tests with the original unit tests equipped by this benchmark. We found that there is no exact match between them, and even the number of LLM-generated unit tests (3.70 on average) is largely different with that of original unit tests (2.41).'"
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": true,
    252         "justification": "Section 4 acknowledges 'a potential data leakage threat' with Defects4J (published 2014, before all models' training). They perform a basic comparison analysis and note plans to extend to more recent benchmarks (GitBug-Java)."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study. The manual analysis of defects is a labeling task by authors, not a human subjects study."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in this study."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No per-inference cost or latency is reported. GPT-4 API costs are not mentioned. No wall-clock time per generation is provided."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "The paper states 'around 3,000 NVIDIA A100 GPU-hours' for all experiments, and specifies hardware: 'four servers with eight NVIDIA A100 GPUs each.'"
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "Temperature is set to 0 for determinism but no seed sensitivity analysis is performed. No discussion of whether other sources of randomness (e.g., VLLM batching) affect results."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The number of experimental runs is not explicitly stated. Temperature=0 implies single deterministic runs, but this is never stated directly."
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The ablation on prompt design explores a systematic set of variants, but no hyperparameter search budget (total configurations tried, compute spent on search) is reported."
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": true,
    323         "justification": "Section 3.1 describes a systematic ablation approach: best description style per model is selected based on statistical significance, and best code feature set is selected from ablation results. The selection criterion is clearly stated."
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "Many Wilcoxon rank sum tests are conducted across models, styles, and feature variants, but no correction for multiple comparisons (Bonferroni, Holm, etc.) is mentioned."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The authors use original model weights and EvoSuite with default settings, but do not discuss self-comparison bias or acknowledge that their experimental framework choices could favor certain outcomes."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "Models ranging from 7B to 34B parameters are compared without discussing compute cost per inference. GPT-4 uses substantially more compute than 7B models but this is not analyzed."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "Defects4J is used without discussion of whether it adequately represents real-world unit test generation scenarios. No analysis of potential selection bias in the benchmark's projects or defect types."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "No agentic scaffolding is used. All models go through the same single-pass generation and post-processing pipeline."
    349       }
    350     },
    351     "data_leakage": {
    352       "temporal_leakage_addressed": {
    353         "applies": true,
    354         "answer": true,
    355         "justification": "Section 4 acknowledges 'a potential data leakage threat' and notes Defects4J was published before model training. They compare generated vs original tests as a basic temporal leakage check and plan to extend to more recent benchmarks."
    356       },
    357       "feature_leakage_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether input features (code context in prompts) could leak test-relevant information beyond what a real developer would have. The evaluation setup is not analyzed for information leakage."
    361       },
    362       "non_independence_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "Multiple focal methods come from the same 17 projects but no discussion of whether this creates non-independence in the results. Methods from the same project likely share code patterns."
    366       },
    367       "leakage_detection_method": {
    368         "applies": true,
    369         "answer": true,
    370         "justification": "Section 4: they compare LLM-generated tests with original Defects4J tests, finding 'no exact match' and different average counts (3.70 vs 2.41). This is a concrete, though limited, detection method."
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "Prompt description style significantly affects LLM effectiveness: CodeLlama models perform better with natural language style, while DeepSeek-Coder models are robust to style choice.",
    377       "evidence": "Table 1 shows CL-7B and CL-13B achieve significantly higher CSR, CovL, and CovB with NL style (all six cases statistically significant). DC-7B, DC-33B, PD-34B show no significant difference in 7/9 cases. Root cause traced to training data alignment (Section 3.1).",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Including other methods in the focal class (FCm) helps syntactic validity but harms test coverage due to consuming prompt space.",
    382       "evidence": "Table 2: removing FCm decreases CSR by 2.14-14.92% (statistically significant for all models). Table 3: removing FCm increases coverage in almost all cases with statistical significance. The average number of generated tests increases from 3,654 to 5,434 after removal.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "All studied LLMs, including GPT-4, underperform traditional EvoSuite in test coverage.",
    387       "evidence": "Table 4: EvoSuite achieves 78.91% CovL and 76.59% CovB vs GPT-4's 40.43% CovL and 31.78% CovB. The gap is attributed to 34-62% syntactically invalid LLM-generated tests.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Chain-of-Thought improves DeepSeek-Coder models but hurts CodeLlama models in unit test generation.",
    392       "evidence": "Table 5: CoT brings +2.72% CovL for DC-7B (statistically significant) but -3.04% for CL-7B and -6.45% for CL-13B (statistically significant). Attributed to differences in code comprehension ability.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "RAG adapted from code generation consistently hurts all LLMs in unit test generation.",
    397       "evidence": "Table 5: all five open-source LLMs show decreased CovL (3.34-9.28%) and CovB (4.44-8.78%) with RAG. Cause: significant gap between retrieved tests (avg 12.10 LOC, 2.41 per method) and generated tests (avg 5.60 LOC, 6.94 per method).",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "Defect detection ability of LLM-generated tests is weak: 87.13% of defects have no valid tests, and only 47.28% of testable defects are detected.",
    402       "evidence": "Table 6: NTD ranges from 28 to 65 out of 413 defects. NDD ranges from 12 to 39. Table 7 analyzes reasons: 74.99% of undetected defects are due to missing specific defect-triggering inputs.",
    403       "supported": "strong"
    404     },
    405     {
    406       "claim": "LLM hallucination causes three main types of compilation errors: unresolved symbol (30.68%), parameter mismatch (17.25%), and abstract instantiation (10.38%).",
    407       "evidence": "Section 3.2 provides detailed breakdown of error types from analyzing invalid generated tests across all LLMs.",
    408       "supported": "moderate"
    409     }
    410   ],
    411   "red_flags": [
    412     {
    413       "flag": "Huawei conflict of interest",
    414       "detail": "Four of eleven authors are from Huawei Cloud Computing, the first author completed the work during a Huawei internship, and the study is partially funded by CCF-Huawei Populus Grove Fund. Huawei has commercial interest in AI-powered software engineering tools. No competing interests statement is provided."
    415     },
    416     {
    417       "flag": "No held-out evaluation set",
    418       "detail": "Best prompt configurations are selected in RQ1 on the same Defects4J data used for evaluation in RQ2-4. This means configuration selection and final evaluation use identical data, inflating apparent performance of the 'best' settings."
    419     },
    420     {
    421       "flag": "Weak contamination analysis",
    422       "detail": "Data leakage analysis consists only of comparing generated test counts and checking for exact matches with original tests. This is insufficient — models could have memorized patterns, structures, or partial solutions without producing exact copies."
    423     },
    424     {
    425       "flag": "No multiple comparison correction",
    426       "detail": "Dozens of Wilcoxon rank sum tests are conducted across models, styles, features, and ICL methods without any correction for family-wise error rate, increasing the risk of false positives."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "No More Manual Tests? Evaluating and Improving ChatGPT for Unit Test Generation",
    432       "authors": ["Zhiqiang Yuan", "Yiling Lou", "Mingwei Liu"],
    433       "year": 2023,
    434       "arxiv_id": "2305.04207",
    435       "relevance": "Directly relevant prior study evaluating ChatGPT for unit test generation with CoT, serving as key comparison point."
    436     },
    437     {
    438       "title": "ChatUniTest: a ChatGPT-based automated unit test generation tool",
    439       "authors": ["Zhuokui Xie", "Yinghao Chen", "Chen Zhi"],
    440       "year": 2023,
    441       "arxiv_id": "2305.04764",
    442       "relevance": "ChatGPT-based unit test generation tool with self-repair capability, evaluated in prior work on LLM test generation."
    443     },
    444     {
    445       "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation",
    446       "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"],
    447       "year": 2024,
    448       "relevance": "Empirical evaluation of GPT-3.5 for JavaScript unit test generation (TestPilot), directly comparable study design."
    449     },
    450     {
    451       "title": "Exploring the Effectiveness of Large Language Models in Generating Unit Tests",
    452       "authors": ["Mohammed Latif Siddiq", "Joanna C. S. Santos"],
    453       "year": 2023,
    454       "arxiv_id": "2305.00418",
    455       "relevance": "Prior study evaluating GPT-3.5 and Codex effectiveness in unit test generation."
    456     },
    457     {
    458       "title": "CODAMOSA: Escaping coverage plateaus in test generation with pre-trained large language models",
    459       "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K Lahiri"],
    460       "year": 2023,
    461       "relevance": "Combines evolutionary search (EvoSuite-style) with LLM code generation to overcome coverage plateaus."
    462     },
    463     {
    464       "title": "Evaluating Large Language Models Trained on Code",
    465       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    466       "year": 2021,
    467       "arxiv_id": "2107.03374",
    468       "relevance": "Codex evaluation paper establishing LLM code generation benchmarks and methodology."
    469     },
    470     {
    471       "title": "Code Llama: Open Foundation Models for Code",
    472       "authors": ["Baptiste Rozière", "Jonas Gehring"],
    473       "year": 2023,
    474       "arxiv_id": "2308.12950",
    475       "relevance": "Foundation model paper for CodeLlama, one of the two model families evaluated in this study."
    476     },
    477     {
    478       "title": "EvoSuite: automatic test suite generation for object-oriented software",
    479       "authors": ["Gordon Fraser", "Andrea Arcuri"],
    480       "year": 2011,
    481       "relevance": "The primary traditional baseline for automated test generation used as comparison point in this study."
    482     },
    483     {
    484       "title": "Effective test generation using pre-trained Large Language Models and mutation testing",
    485       "authors": ["Arghavan Moradi Dakhel", "Amin Nikanjam"],
    486       "year": 2024,
    487       "relevance": "Combines LLMs with mutation testing for test generation, exploring code feature effects on prompt design."
    488     },
    489     {
    490       "title": "Enhancing LLM-based Test Generation for Hard-to-Cover Branches via Program Analysis",
    491       "authors": ["Chen Yang", "Junjie Chen", "Bin Lin"],
    492       "year": 2024,
    493       "arxiv_id": "2404.04966",
    494       "relevance": "TELPA system integrating LLMs with program analysis for test generation, from overlapping author team."
    495     },
    496     {
    497       "title": "Large Language Models for Software Engineering: Survey and Open Problems",
    498       "authors": ["Angela Fan", "Beliz Gokkaya", "Mark Harman"],
    499       "year": 2023,
    500       "relevance": "Comprehensive survey of LLMs in software engineering covering testing, code generation, and open challenges."
    501     },
    502     {
    503       "title": "Practical Program Repair in the Era of Large Pre-trained Language Models",
    504       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    505       "year": 2022,
    506       "arxiv_id": "2210.14179",
    507       "relevance": "Evaluates LLMs for automated program repair using similar Defects4J benchmark, with data leakage discussion."
    508     },
    509     {
    510       "title": "Unit Test Case Generation with Transformers",
    511       "authors": ["Michele Tufano", "Dawn Drain", "Alexey Svyatkovskiy"],
    512       "year": 2020,
    513       "arxiv_id": "2009.05617",
    514       "relevance": "AthenaTest: early DL-based unit test generation using BART transformer, foundational prior work."
    515     },
    516     {
    517       "title": "Software Testing With Large Language Models: Survey, Landscape, and Vision",
    518       "authors": ["Junjie Wang", "Yuchao Huang", "Chunyang Chen"],
    519       "year": 2024,
    520       "relevance": "Comprehensive survey of LLM-based software testing including test generation, relevant to survey scope."
    521     }
    522   ],
    523   "engagement_factors": {
    524     "practical_relevance": {
    525       "score": 2,
    526       "justification": "Provides actionable guidelines for prompt design and LLM selection for unit test generation, though not a ready-to-use tool."
    527     },
    528     "surprise_contrarian": {
    529       "score": 1,
    530       "justification": "Some mildly surprising findings (RAG hurts all models, EvoSuite still dominates) but these confirm growing skepticism rather than overturning beliefs."
    531     },
    532     "fear_safety": {
    533       "score": 0,
    534       "justification": "No AI safety or security concerns raised."
    535     },
    536     "drama_conflict": {
    537       "score": 0,
    538       "justification": "No controversy or dramatic claims; straightforward empirical study."
    539     },
    540     "demo_ability": {
    541       "score": 1,
    542       "justification": "Code released on GitHub but it's an experimental framework, not a user-facing tool."
    543     },
    544     "brand_recognition": {
    545       "score": 1,
    546       "justification": "Uses GPT-4 as reference and published at ASE, but authors are not from a widely-recognized AI lab."
    547     }
    548   }
    549 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs