scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28190B)
      1 {
      2   "paper": {
      3     "title": "Benchmarking LLMs for Unit Test Generation from Real-World Functions",
      4     "authors": [
      5       "Dong Huang",
      6       "Jie M. Zhang",
      7       "Mark Harman",
      8       "Qianru Zhang",
      9       "Mingzhe Du",
     10       "See-Kiong Ng"
     11     ],
     12     "year": 2025,
     13     "venue": "ACM (manuscript submitted)",
     14     "arxiv_id": "2508.00408",
     15     "doi": null
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper provides a GitHub link: https://github.com/huangd1999/UnLeakedTestBench. The abstract states 'We also make ULT and evaluation results publicly available to foster further research.'"
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The benchmark (ULT) is publicly released at the GitHub repository. The paper explicitly states the 3,909 Python functions are released, though ground-truth tests are withheld to prevent contamination. Evaluation results for all 12 models are also provided."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper mentions a 'unified and standardized Docker environment' (Section 7.5.1) for test execution but does not provide the Dockerfile, requirements.txt, or specific library versions used. No detailed environment specification is given."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions providing a 'self-contained evaluation script' for metric computation, but its text does not include step-by-step reproduction instructions, and it does not describe a README with commands. The instructions for reproducing the full experimental setup are not detailed."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "All results are reported as point estimates (e.g., '41.32% accuracy', '45.10% line coverage'). No confidence intervals, error bars, or uncertainty measures are reported for any metric."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper reports Pearson correlation coefficients with p-values (e.g., r=0.79, p=0.002 for ULT; r=0.56, p=0.059 for TestEval) and also Spearman and Kendall rank correlations with p-values in Section 6.2.2 and Figure 3."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The paper reports correlation coefficients (r=0.79, r=0.56, r=0.52) as effect sizes for the relationship between coding ability and test generation performance. Performance differences are presented with baseline context (e.g., 'average Pass@1 across all models is 12.69% on ULT, compared to 48.42% on PLT and 57.74% on TestEval')."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No justification is provided for why 12 LLMs were selected, why 3,909 functions constitute a sufficient benchmark size, or why 1,000 additional functions were sampled for PLT mutation testing. No power analysis is discussed."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper uses greedy decoding (temperature=0) for its main experiments to produce deterministic outputs, so there is no variance across runs for those results (the no-feedback ablation uses temperature=0.2). However, the paper does not report any variance or distribution information across the 3,909 functions (e.g., standard deviation of per-function metrics). All results are single-run averages."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper compares ULT against two baseline benchmarks: TestEval and PLT. Performance of 12 LLMs is compared across all three benchmarks with multiple metrics."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "TestEval (2024) is a recent benchmark. The 12 LLMs evaluated include contemporary models like Qwen2.5-Coder, Seed-Coder, Gemma-3, and Phi-4. The paper also compares cyclomatic complexity distributions with TestGenEval (2024)."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper includes ablation-like analysis: (1) comparison of feedback-driven vs. no-feedback query strategies (Table 6), (2) controlled comparison between contaminated (PLT) and decontaminated (ULT) subsets to isolate the effect of data contamination, and (3) analysis of performance as a function of cyclomatic complexity."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper uses four distinct metrics: test generation accuracy (Pass@k), line coverage (LCov@k), branch coverage (BCov@k), and mutation score (Mut@k). Each is reported at k=1, 2, and 5."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No human evaluation is included. All evaluation is automated via test execution, coverage measurement, and mutation testing. Given that the paper makes claims about test quality and the realism of generated tests, human evaluation of test readability or meaningfulness could have been valuable."
     92       },
     93       "held_out_test_set": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "This is a benchmark construction paper, not a machine learning training paper. There is no model training or tuning involved, so the concept of a held-out test set does not apply in the traditional sense."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Results are broken down per model (12 models individually in Tables 2-5), per cyclomatic complexity bin (Figures 1-2), and per metric (Pass@k, LCov@k, BCov@k, Mut@k). Per-model breakdowns are comprehensive."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "The paper does not include qualitative analysis of failure cases. There is no discussion of specific functions where LLMs fail, what kinds of code structures are most challenging, or examples of incorrectly generated tests. All analysis is quantitative and aggregate."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper reports that accuracy on TestEval decays sharply with increasing k (e.g., deepseek-coder-33b drops from 72.86% at k=1 to 42.55% at k=20), and that the no-feedback strategy produces much weaker results. These represent informative negative results."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims about lower performance on ULT (41.32% accuracy, 45.10% line coverage, 30.22% branch coverage, 40.21% mutation score) and strong correlation (r=0.79, p=0.002) are all supported by the detailed results in Tables 2-5 and Figure 3."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper's main causal claim is that data contamination inflates benchmark performance. This is supported by a controlled comparison between ULT (decontaminated) and PLT (contaminated), which share the same construction process except for the decontamination step. This controlled design adequately supports the causal inference."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 7.5.2 (External Validity) explicitly bounds generalizations: the study is limited to Python, to 12 specific LLMs, to self-contained functions, and may not represent all programming paradigms. The paper states 'caution should be exercised when generalizing them to other types of software.'"
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The Threats to Validity section (7.5) discusses multiple alternative explanations: the performance gap between ULT and TestEval could partly reflect differences in task nature beyond complexity (Section 7.5.3), BigCodeBench may not perfectly proxy coding ability, and the iterative prompt design could influence results. The authors also acknowledge that cyclomatic complexity may be a proxy for code size."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Table 1 lists specific model identifiers: CodeLlama-7b-Instruct-hf, Seed-Coder-8B-Instruct, deepseek-coder-1.3b-instruct, deepseek-coder-6.7b-instruct, deepseek-coder-33b-instruct, gemma-3-4b-it, gemma-3-12b-it, gemma-3-27b-it, Qwen2.5-Coder-7B-Instruct, Qwen2.5-Coder-14B-Instruct, Qwen2.5-Coder-32B-Instruct, Phi-4-mini-instruct. These are specific model names with size variants, though no snapshot or release dates are given for these locally-run models."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper describes the prompt structure in natural language (Section 3.3: 'prompted to generate a single unit test case', 'explicitly instructed to generate one new test case that is distinct') but does not provide the actual prompt text used. No appendix with full prompts is included."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Section 4.3 reports: temperature=0.0 for greedy decoding, max_tokens=1024. For the no-feedback ablation, temperature=0.2. These are the key hyperparameters for inference."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. The LLMs are prompted directly for test generation in an iterative but straightforward manner without tools, memory, or feedback loops beyond the previous test cases being included in the prompt."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 3.2 documents the full multi-stage filtering pipeline: data collection from The Stack v2, filtering by cyclomatic complexity (>=10), self-containment check, testability guarantee (with GPT-4o debugging, limit of 3 attempts), and decontamination by searching for test definitions. Final counts are provided: 3,909 for ULT, 18,169 for PLT."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 7.5 'Threats to Validity' contains three substantive subsections: Internal Validity (7.5.1), External Validity (7.5.2), and Construct Validity (7.5.3), spanning approximately 1.5 pages."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The threats are specific to this study: reproducibility concerns with LLM decoding strategies mitigated by temperature=0, Docker environment for consistent test execution, representativeness limited to Python and self-contained functions from The Stack v2, reliance on BigCodeBench as proxy for coding ability, and the influence of prompt wording on iterative test generation."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 7.5.2 explicitly states scope boundaries: limited to Python, limited to 12 specific LLMs, limited to self-contained functions, 'may not encompass the full spectrum of programming paradigms, application domains, or coding styles.' The paper notes findings 'may not directly translate to other languages like Java, C++, or JavaScript.'"
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The benchmark functions are publicly released at https://github.com/huangd1999/UnLeakedTestBench. The paper states 'We publicly release the curated set of 3,909 Python functions' and complete evaluation results for all benchmarked models."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 3.2 describes data collection from The Stack v2 corpus with specific filtering criteria: cyclomatic complexity >= 10, self-containment, testability guarantee with at least 3 test inputs per function, and decontamination via name-based search."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants are involved. The data source is a standard public code corpus (The Stack v2)."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 3.2 documents the full pipeline: collection from The Stack v2 -> cyclomatic complexity filtering (>=10) -> self-containment filtering -> testability guarantee (with GPT-4o verification) -> decontamination (name-based search for test definitions). Final counts are given: 3,909 for ULT, 18,169 for PLT. However, intermediate counts at each stage are not provided."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding or acknowledgments section is present in the paper. There is no mention of grants, sponsors, or funding agencies."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: National University of Singapore, King's College London, University College London, and University of Cambridge. None of the authors are affiliated with the companies whose models are being evaluated (open-source models from various companies)."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "Since funding is not disclosed at all, it is impossible to assess whether the funder is independent of the outcome. The absence of a funding disclosure is a gap."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper does not state the training data cutoff dates for the 12 LLMs evaluated. This is a notable gap given that the paper is centrally about data contamination. The authors address contamination through their benchmark design (decontamination of the test data) but do not report when each model's training data was collected."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "This is a central theme of the paper. The entire ULT/PLT design is built around addressing train/test overlap. Section 3.2 describes the decontamination process, and RQ3 (Section 6.2) directly analyzes the effect of contamination by comparing ULT vs PLT performance."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "Contamination is the paper's primary concern. The benchmark is specifically designed to mitigate it through name-based decontamination, and Section 7.4 describes measures to avoid future contamination (withholding ground-truth tests, providing evaluation scripts, licensing restrictions)."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study. It is a benchmark evaluation of LLMs."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference costs, API costs, tokens consumed, or wall-clock time are reported for running the 12 LLMs across the benchmark tasks. The paper uses locally-run open-source models but does not report compute time or resource usage."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No computational budget is stated. The hardware used for inference and evaluation (GPUs, machines) is not described. The mutation testing timeout is mentioned (120 seconds per function) but total compute time is not reported."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "ULT is significantly more challenging than existing benchmarks, with LLMs achieving average Pass@5 of 12.57% on ULT compared to 44.45% on PLT and 51.93% on TestEval.",
    294       "evidence": "Table 2 shows Pass@k scores across all 12 LLMs for ULT, PLT, and TestEval. The overall averages confirm the stated numbers. Similar patterns hold for line coverage (Table 3), branch coverage (Table 4), and mutation score (Table 5).",
    295       "supported": "strong"
    296     },
    297     {
    298       "claim": "ULT shows a strong correlation between test generation performance and code generation ability (r=0.79, p=0.002), while TestEval and PLT show weaker, non-significant correlations.",
    299       "evidence": "Figure 3 and Section 6.2.2 report Pearson, Spearman, and Kendall correlations between BigCodeBench scores and test generation metrics. ULT shows r=0.79 (p=0.002) for Pass@1, while TestEval shows r=0.56 (p=0.059) and PLT shows r=0.52 (p=0.080).",
    300       "supported": "strong"
    301     },
    302     {
    303       "claim": "Data contamination inflates LLM performance on test generation benchmarks, as evidenced by the controlled comparison between ULT (decontaminated) and PLT (contaminated).",
    304       "evidence": "Section 6.2 shows consistent performance gaps between ULT and PLT across all models and metrics even at equivalent cyclomatic complexity levels (Figure 2). For branch coverage, PLT shows negligible correlation with coding ability (r=0.22, p=0.492) while ULT shows strong correlation (r=0.77, p=0.004), suggesting PLT performance is driven by memorization.",
    305       "supported": "strong"
    306     },
    307     {
    308       "claim": "ULT's higher cyclomatic complexity explains performance differences, with a clear negative correlation between complexity and LLM performance.",
    309       "evidence": "Figure 1 shows declining branch coverage with increasing cyclomatic complexity across all models. ULT has mean complexity of 14.87 (range 10-82) vs TestEval's 12.35 (range 10-40). However, even at equivalent complexity levels [10,20), ULT shows ~40% lower branch coverage than TestEval (Section 6.1).",
    310       "supported": "moderate"
    311     },
    312     {
    313       "claim": "Accuracy decay on TestEval as k increases is a symptom of data contamination, where initial high accuracy reflects memorization that degrades when novelty is required.",
    314       "evidence": "Figure 4 shows accuracy on TestEval drops sharply (e.g., deepseek-coder-33b from 72.86% at k=1 to 42.55% at k=20), while ULT accuracy remains stable. This is presented as evidence for memorization but is hypothesized rather than directly measured. Alternative explanations (e.g., task difficulty increasing when constrained to novel tests) are not fully ruled out.",
    315       "supported": "moderate"
    316     }
    317   ],
    318   "methodology_tags": [
    319     "benchmark-eval"
    320   ],
    321   "key_findings": "ULT (UnLeakedTestBench) is a new benchmark of 3,909 real-world Python functions with high cyclomatic complexity (>=10) and rigorous decontamination for evaluating LLM-based unit test generation. Across 12 LLMs, ULT proves significantly more challenging than TestEval and PLT, with average Pass@5 of only 12.57% compared to 51.93% and 44.45% respectively. The controlled comparison between contaminated (PLT) and decontaminated (ULT) subsets provides strong evidence that data contamination inflates existing benchmark scores. Performance on ULT correlates strongly with general coding ability (r=0.79, p=0.002), suggesting it measures genuine reasoning rather than memorization.",
    322   "red_flags": [
    323     {
    324       "flag": "No variance or uncertainty quantification",
    325       "detail": "Despite evaluating across 3,909 functions, the paper reports only aggregate averages without any standard deviation, confidence intervals, or per-function variance. The use of temperature=0 eliminates run-to-run variance but does not address the question of how stable the aggregate metrics are across different function subsets."
    326     },
    327     {
    328       "flag": "Missing intermediate pipeline counts",
    329       "detail": "The data pipeline starts from The Stack v2 but does not report how many functions existed at each intermediate stage (after complexity filtering, after self-containment filtering, after testability guarantee). Only the final counts (3,909 ULT, 18,169 PLT) are given, making it harder to assess selection bias."
    330     },
    331     {
    332       "flag": "Prompts not provided",
    333       "detail": "For a paper about benchmarking LLM test generation, the actual prompts used are not included. The iterative prompt structure is described in natural language (Section 3.3) but the exact text given to models is not shown, limiting reproducibility."
    334     },
    335     {
    336       "flag": "GPT-4o used in benchmark construction",
    337       "detail": "Section 3.2 describes using GPT-4o to check for bugs and fix functions during the testability guarantee stage. This introduces a potential confound: functions that GPT-4o could understand and fix may be systematically different from those it could not, potentially biasing the benchmark toward certain code patterns."
    338     },
    339     {
    340       "flag": "Name-based decontamination may be incomplete",
    341       "detail": "The decontamination process searches for test functions matching 'test_func_name' or assertions using 'assert func_name'. This name-based approach could miss test cases that reference the function through aliases, indirect calls, or different naming conventions, potentially leaving some contamination in ULT."
    342     }
    343   ],
    344   "cited_papers": [
    345     {
    346       "title": "Testeval: Benchmarking large language models for test case generation",
    347       "authors": ["W. Wang", "C. Yang", "Z. Wang", "Y. Huang", "Z. Chu", "D. Song", "L. Zhang", "A. R. Chen", "L. Ma"],
    348       "year": 2024,
    349       "arxiv_id": "2406.04531",
    350       "relevance": "Primary baseline benchmark for evaluating LLM test generation capabilities, directly compared against ULT."
    351     },
    352     {
    353       "title": "Testgeneval: A real world unit test generation and test completion benchmark",
    354       "authors": ["K. Jain", "G. Synnaeve", "B. Rozière"],
    355       "year": 2024,
    356       "arxiv_id": "2410.00752",
    357       "relevance": "Another benchmark for LLM test generation from real-world code, compared in terms of cyclomatic complexity distribution."
    358     },
    359     {
    360       "title": "SWE-bench: Can language models resolve real-world github issues?",
    361       "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig", "S. Yao", "K. Pei", "O. Press", "K. Narasimhan"],
    362       "year": 2023,
    363       "arxiv_id": "2310.06770",
    364       "relevance": "Foundational benchmark for evaluating LLMs on software engineering tasks; SWT-Bench and TestGenEval derive from it."
    365     },
    366     {
    367       "title": "Swt-bench: Testing and validating real-world bug-fixes with code agents",
    368       "authors": ["N. Mündler", "M. Müller", "J. He", "M. Vechev"],
    369       "year": 2024,
    370       "relevance": "Benchmark that transforms SWE-Bench code repair tasks into test generation tasks for issue reproduction."
    371     },
    372     {
    373       "title": "An empirical evaluation of using large language models for automated unit test generation",
    374       "authors": ["M. Schäfer", "S. Nadi", "A. Eghbali", "F. Tip"],
    375       "year": 2023,
    376       "relevance": "Foundational empirical evaluation of LLMs for unit test generation, providing early insights into capabilities and limitations."
    377     },
    378     {
    379       "title": "Lessleak-bench: A first investigation of data leakage in LLMs across 83 software engineering benchmarks",
    380       "authors": ["X. Zhou", "M. Weyssow", "R. Widyasari", "T. Zhang", "J. He", "Y. Lyu", "J. Chang", "B. Zhang", "D. Huang", "D. Lo"],
    381       "year": 2025,
    382       "arxiv_id": "2502.06215",
    383       "relevance": "Investigates data leakage across SE benchmarks, directly motivating ULT's decontamination approach."
    384     },
    385     {
    386       "title": "Codamosa: Escaping coverage plateaus in test generation with pre-trained large language models",
    387       "authors": ["C. Lemieux", "J. P. Inala", "S. K. Lahiri", "S. Sen"],
    388       "year": 2023,
    389       "relevance": "Hybrid approach combining LLMs with search-based software testing for test generation, relevant to the survey's coverage of LLM-based testing tools."
    390     },
    391     {
    392       "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions",
    393       "authors": ["T. Y. Zhuo", "M. C. Vu", "J. Chim", "H. Hu", "W. Yu", "R. Widyasari"],
    394       "year": 2024,
    395       "arxiv_id": "2406.15877",
    396       "relevance": "Used as proxy for LLMs' general coding ability in the correlation analysis; relevant benchmark for code generation evaluation."
    397     },
    398     {
    399       "title": "Large language models for software engineering: A systematic literature review",
    400       "authors": ["X. Hou", "Y. Zhao", "Y. Liu", "Z. Yang", "K. Wang", "L. Li", "X. Luo", "D. Lo", "J. Grundy", "H. Wang"],
    401       "year": 2024,
    402       "relevance": "Comprehensive survey of LLMs in software engineering, providing broader context for the test generation evaluation."
    403     },
    404     {
    405       "title": "Reflexion: Language agents with verbal reinforcement learning",
    406       "authors": ["N. Shinn", "F. Cassano", "A. Gopinath", "K. Narasimhan", "S. Yao"],
    407       "year": 2023,
    408       "relevance": "Influential agentic LLM framework that enhances reasoning and learning for coding tasks through verbal reinforcement."
    409     },
    410     {
    411       "title": "AgentCoder: Multi-agent-based code generation with iterative testing and optimisation",
    412       "authors": ["D. Huang", "J. M. Zhang", "M. Luck", "Q. Bu", "Y. Qing", "H. Cui"],
    413       "year": 2023,
    414       "arxiv_id": "2312.13010",
    415       "relevance": "Multi-agent framework for code generation with iterative testing, relevant to agentic AI approaches in software engineering."
    416     },
    417     {
    418       "title": "Large-scale, independent and comprehensive study of the power of LLMs for test case generation",
    419       "authors": ["W. C. Ouédraogo", "K. Kaboré", "H. Tian", "Y. Song", "A. Koyuncu", "J. Klein", "D. Lo", "T. F. Bissyandé"],
    420       "year": 2024,
    421       "arxiv_id": "2407.00225",
    422       "relevance": "Large-scale empirical study of LLM test generation capabilities, providing complementary evidence on the topic."
    423     }
    424   ]
    425 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs