scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30360B)
      1 {
      2   "paper": {
      3     "title": "A System for Automated Unit Test Generation Using Large Language Models and Assessment of Generated Test Suites",
      4     "authors": [
      5       "Andrea Lops",
      6       "Fedelucio Narducci",
      7       "Azzurra Ragone",
      8       "Michelantonio Trizio",
      9       "Claudio Bartolini"
     10     ],
     11     "year": 2024,
     12     "venue": "International Conference on Software Testing, Verification and Validation Workshops",
     13     "arxiv_id": "2408.07846",
     14     "doi": "10.1109/ICSTW64639.2025.10962454"
     15   },
     16   "scan_version": 3,
     17   "active_modules": [
     18     "experimental_rigor",
     19     "data_leakage"
     20   ],
     21   "methodology_tags": [
     22     "benchmark-eval",
     23     "case-study"
     24   ],
     25   "key_findings": "AGONETEST automates end-to-end unit test generation and evaluation for Java projects using LLMs. On 10 repositories (94 focal classes), approximately 75% of LLM-generated test classes compiled and ~34% passed all tests. GPT-4 achieved higher coverage than GPT-3.5-turbo across most metrics, but human-written tests consistently outperformed both models on mutation coverage (0.69 vs. 0.55 best LLM). Zero-shot prompting unexpectedly outperformed few-shot for GPT-4.",
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper provides an anonymous link (https://anonymous.4open.science/r/classes2test) for the CLASSES2TEST dataset, but no link to the AGONETEST system code is provided. Only the dataset is released, not the tool."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The CLASSES2TEST dataset is released via an anonymous repository link (footnote 1). The dataset extends METHODS2TEST with 147,473 test classes from 9,410 repositories."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions using Python libraries (tiktoken, LiteLLM, ElementTree) and Java build tools but does not provide a reproducible environment specification."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No step-by-step reproduction instructions are included. The paper describes the system architecture and workflow but does not provide commands, scripts, or a README for reproducing the experiments."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "Tables IV and V report only point estimates (e.g., 0.879 instruction coverage) with no confidence intervals or error bars."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper claims GPT-4 zero-shot yields the best results and that human tests outperform LLMs on mutation coverage, but no statistical significance tests are performed for any comparison."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are reported. Tables show raw coverage values but the magnitude of differences between configurations is not formally characterized."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "Section V-A states 10 repos were 'chosen to be representative enough of the variability encountered in real-world projects, while ensuring that is tractable by our system in terms of scale.' This is a vague rationalization, not a statistical justification or power analysis."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No variance, standard deviation, or spread measures are reported. Temperature is set to 0, implying single deterministic runs, but there is no discussion of variability across different projects or classes beyond aggregate averages."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Human-written tests from the CLASSES2TEST dataset serve as a baseline. Table V includes a 'human' row for comparison across all metrics."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "The paper extensively discusses EvoSuite and other automated test generation tools (ChatTester, TestPilot, CEDAR) in Section II but does not include any of them as experimental baselines. Only human-written tests and two LLM configurations are compared."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No ablation study is performed on AGONETEST components. The comparison of zero-shot vs. few-shot is a technique comparison, not an ablation of system components (e.g., automated context extraction, library integration)."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Five coverage metrics are used (instruction, branch, line, method, mutation coverage) plus 19 test smell indicators. Table V reports all coverage metrics; Table II includes test smells."
     96       },
     97       "human_evaluation": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No human evaluation of generated test quality is performed. All evaluation is automated through JaCoCo, PiTest, and TsDetect."
    101       },
    102       "held_out_test_set": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "The 10 repositories are randomly selected from CLASSES2TEST, but there is no explicit held-out test set separation. The few-shot example comes from a separate repository (junit5-samples), but no dev/test split is discussed."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Table II provides per-project, per-class breakdowns of metrics and test smells. Table IV and V break results by model and prompt technique."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section V-C discusses compilation failures and test failures in detail. Table IV quantifies the outcomes: only 30-38% of generated test classes pass all tests, so 62-70% are rejected. Section VI-A1 identifies root causes: incorrect imports, syntax errors, calls to non-existent APIs."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper reports that ~66% of generated tests were rejected (Table IV), human tests consistently outperform on mutation coverage (Table V), and few-shot surprisingly did not help GPT-4. Section VI acknowledges room for improvement."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The abstract claims 'a scalable automated software system, a new dataset, and a detailed methodology for evaluating test quality.' The paper delivers the system description (Section III-IV), the dataset (CLASSES2TEST, Table I), and the evaluation methodology (Section V). Claims are appropriately scoped."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Section VI states 'we found better results for zero-shot prompts for gpt-4 than for the few-shot one,' implying the prompting technique caused the difference. No statistical tests or controlled experimentation isolates this effect from confounds like example quality or interaction effects."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Section VII-A explicitly states: 'its scope is limited to Java projects. This makes our findings hardly generalizable to different programming languages.' Section VII-B acknowledges the limited number of LLMs and prompts tested."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section VI discusses alternative explanations: the few-shot example may not be optimal, temperature settings restrict output diversity (Section VII-B), compilation failures may stem from LLM limitations rather than test design issues (Section VI-A1)."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section VII-D acknowledges: 'While we employed a comprehensive set of metrics and test smell indicators, these metrics alone may not fully capture the quality of the test suite.' This explicitly distinguishes measured proxies (coverage, smells) from the broader outcome (test quality)."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Listing 1 specifies 'gpt-4-1106-preview' (a versioned model) but 'gpt-3.5-turbo' without a snapshot date. The gpt-3.5-turbo endpoint has changed behavior over time, making results non-reproducible without a pinned version."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Full prompt templates for both zero-shot and few-shot are provided in Listing 1, with variable placeholders ({{focal_class}}, {{testing_framework}}, {{java_version}}) whose values are deterministically extractable from the released CLASSES2TEST dataset with saved commit hashes."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Temperature is explicitly set to 0 and the choice is justified in Section V-B: 'to increase the level of coherence in text generation (and to decrease the level of randomness) and make the diverse test suite generated comparable.' No other hyperparameters (top_p, max_tokens) are reported."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "AGONETEST uses a linear pipeline (prompt → LLM call → extract code → compile → evaluate) with no agentic scaffolding, retry loops, or iterative LLM interaction. The LLM is called once per focal class."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section III-A documents the 4-step process for creating CLASSES2TEST from METHODS2TEST. Section V-C describes the filtering pipeline: generated tests → compilation check → green suite check → metric computation, with rejection counts in Table IV."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section VII 'Limitations' is a dedicated section with four subsections (Dataset and Generalization, Model and Prompt Variability, Compilation and Execution Failures, Evaluation Metrics)."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Limitations are specific to this study: Java-only scope (VII-A), only two models tested (VII-B), temperature=0 may limit diversity (VII-B), 66% test rejection rate (VII-C), bias toward well-structured codebases in dataset selection (VII-A)."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section VII-A: 'its scope is limited to Java projects.' VII-B: 'our initial experimental setup involved only two models... and two prompt types.' VII-B: 'results might vary significantly with newer models and advanced prompts.' Explicit about what was NOT tested."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "The CLASSES2TEST dataset is available but the raw experimental outputs (individual test files, per-class metrics, CSV results) are not released. Only aggregate results are shown in tables."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section III-A describes the 4-step process for building CLASSES2TEST: extract repository references, select classes from METHODS2TEST, clone repositories and save commit hashes, map focal classes to test classes. Table I summarizes dataset characteristics."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants. The data source is the standard METHODS2TEST benchmark, which is a publicly available dataset of open-source Java repositories."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The pipeline is documented across Sections III-V: dataset construction (4 steps, Section III-A), configuration extraction (Section III-B), prompt creation (Section III-C), test generation (Section III-D), filtering of failing tests (Section V-C with counts in Table IV), and metric computation (Section III-E)."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No funding source or acknowledgments section is present in the paper. One author is affiliated with Wideverse (a company), suggesting potential industry funding, but this is not disclosed."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "All author affiliations are listed: Polytechnic University of Bari, Wideverse (company), University of Bari, and one Independent Researcher. The evaluated models are from OpenAI, with which the authors have no disclosed affiliation."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No funding is disclosed, so funder independence cannot be assessed. One author is from Wideverse, a company, but no funding relationship is described."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests or financial disclosure statement is present in the paper."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No training data cutoff dates are stated for either GPT-4 or GPT-3.5-turbo. This is important since the test repositories are from GitHub and may be in the models' training data."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No discussion of whether the GitHub repositories in CLASSES2TEST appeared in the training data of GPT-4 or GPT-3.5-turbo. The 9,410 repositories are public and likely included in OpenAI's training data."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "METHODS2TEST was published in 2022; GPT-4's training data extends well past this. The public GitHub repositories could have been seen during model training, including their test classes. This contamination risk is not discussed."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study. All evaluation is automated on code repositories."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in this study."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No API costs, tokens consumed, or wall-clock time per test generation are reported. The paper mentions using tiktoken for token counting but does not report actual token usage or costs."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No computational budget is stated. The paper mentions scalability challenges (Section VI-A3) but does not quantify GPU hours, API spend, or hardware used."
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "Temperature is set to 0, implying deterministic outputs, but the paper does not discuss whether results are sensitive to this choice or report results across different settings."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The paper does not explicitly state the number of experimental runs. Temperature=0 implies single deterministic runs, but this is not stated."
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "No hyperparameter search is performed. Temperature=0 is chosen without exploring alternatives. Section VII-B acknowledges 'Different temperature settings could yield more varied results, which were not explored in this study.'"
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": true,
    323         "justification": "All four configurations (2 models × 2 prompts) are reported in Table V without cherry-picking. The paper explicitly states 'this does not mean that gpt-4 is the best model for all scenarios, and this is neither the objective of our experiment' (Section VI)."
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "Multiple comparisons are made across 4 configurations and 5+ metrics with no statistical tests at all, let alone multiple comparison corrections."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The authors evaluate their own AGONETEST system and implement the prompt strategies, but do not acknowledge or discuss self-evaluation bias."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "GPT-4 and GPT-3.5-turbo have significantly different compute costs, but no cost-performance analysis is provided. Results are compared without considering the cost differential."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "Section VII-D briefly notes 'these metrics alone may not fully capture the quality of the test suite' but provides no substantive analysis of whether coverage metrics and test smells actually measure test quality as claimed."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": true,
    347         "answer": true,
    348         "justification": "All model comparisons use the same AGONETEST pipeline with identical prompt templates and evaluation infrastructure. The scaffold is held constant across all configurations."
    349       }
    350     },
    351     "data_leakage": {
    352       "temporal_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "The GitHub repositories in CLASSES2TEST are public and many predate GPT-4's training cutoff. The models may have seen both the focal classes and their human-written tests during training. This is not discussed."
    356       },
    357       "feature_leakage_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "The prompt provides the focal class to the LLM. If the LLM has memorized the corresponding test class from training data, the generated test is recall rather than generation. This possibility is not discussed."
    361       },
    362       "non_independence_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No discussion of whether the 9,410 GitHub repositories in CLASSES2TEST overlap with OpenAI's training data. Given these are popular open-source repos, high overlap is probable."
    366       },
    367       "leakage_detection_method": {
    368         "applies": true,
    369         "answer": false,
    370         "justification": "No leakage detection or prevention methods are used. No canary strings, membership inference, temporal splits, or decontamination pipelines."
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "AGONETEST provides an end-to-end automated process for generating and evaluating test suites without human intervention.",
    377       "evidence": "System architecture described in Section III with modules for repository selection, configuration, prompt creation, test generation, and assessment. Section IV provides an end-to-end example. Section VI-RQ1 discusses automation degree.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Approximately 75% of LLM-generated test classes compile successfully.",
    382       "evidence": "Table IV shows compilation rates: gpt-3.5-turbo 68-69%, gpt-4 80.85%. Average across 4 configurations is ~75%.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Approximately 34% of generated test classes are green suite (all tests pass) with calculable mutation coverage.",
    387       "evidence": "Table IV shows pass rates: gpt-3.5-turbo 38-37%, gpt-4 31-30%. Rejected percentages range 62-70%.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "GPT-4 outperforms GPT-3.5-turbo on most coverage metrics.",
    392       "evidence": "Table V shows GPT-4 zero-shot achieves higher instruction (0.879 vs 0.757), branch (0.777 vs 0.706), line (0.866 vs 0.777), and method coverage (0.855 vs 0.848). No statistical tests support this comparison.",
    393       "supported": "weak"
    394     },
    395     {
    396       "claim": "Human-written tests consistently outperform LLM-generated tests on mutation coverage.",
    397       "evidence": "Table V shows human mutation coverage of 0.691 vs best LLM at 0.547 (GPT-4 zero-shot and GPT-3.5 zero-shot). No statistical test performed.",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "Zero-shot prompting yields better results for GPT-4 than few-shot prompting.",
    402       "evidence": "Table V shows GPT-4 zero-shot outperforms GPT-4 few-shot on instruction (0.879 vs 0.753), line (0.866 vs 0.782), and mutation coverage (0.546 vs 0.462). Based on a single example for few-shot, no statistical tests.",
    403       "supported": "weak"
    404     }
    405   ],
    406   "red_flags": [
    407     {
    408       "flag": "Tiny sample size",
    409       "detail": "Only 10 repositories with 94 focal classes are used for evaluation, and after filtering, results are based on as few as 29-36 green suite classes per configuration (Table IV). This is too small to draw reliable conclusions about LLM test generation quality."
    410     },
    411     {
    412       "flag": "No statistical tests",
    413       "detail": "All comparisons between models, prompt strategies, and human baselines are based solely on comparing raw numbers. No significance tests, confidence intervals, or effect sizes are reported for any claim."
    414     },
    415     {
    416       "flag": "Survivorship bias in evaluation",
    417       "detail": "Approximately 66% of generated tests are rejected (non-compiling or failing). Metrics are computed only on the surviving ~34%. This systematically inflates the apparent quality of LLM-generated tests by excluding failures."
    418     },
    419     {
    420       "flag": "Training data contamination risk",
    421       "detail": "The 9,410 GitHub repositories are public and likely present in GPT-4 and GPT-3.5-turbo training data. The models may have memorized test classes rather than generating them. This fundamental validity threat is not discussed."
    422     },
    423     {
    424       "flag": "Missing automated tool baselines",
    425       "detail": "Despite extensive discussion of EvoSuite, ChatTester, TestPilot, and other tools in Section II, none are included as experimental baselines. Only human-written tests are compared against."
    426     },
    427     {
    428       "flag": "Single few-shot example",
    429       "detail": "The few-shot prompt always uses the same example from junit5-samples (Section V-B). The poor few-shot results for GPT-4 may reflect the choice of example rather than a property of few-shot prompting, but this is not explored."
    430     }
    431   ],
    432   "cited_papers": [
    433     {
    434       "title": "EvoSuite: Automatic Test Suite Generation for Object-Oriented Software",
    435       "authors": ["G. Fraser", "A. Arcuri"],
    436       "year": 2011,
    437       "relevance": "Foundational search-based automated test generation tool for Java and the primary prior approach that AGONETEST aims to improve upon; discussed in Section II but not included as an experimental baseline."
    438     },
    439     {
    440       "title": "An Initial Investigation of ChatGPT Unit Test Generation Capability",
    441       "authors": ["V. Guilherme", "A. Vincenzi"],
    442       "year": 2023,
    443       "relevance": "Early empirical evaluation of ChatGPT for unit test generation, examining hyperparameter impact on test quality."
    444     },
    445     {
    446       "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation",
    447       "authors": ["M. Schäfer", "S. Nadi", "A. Eghbali", "F. Tip"],
    448       "year": 2023,
    449       "relevance": "Empirical evaluation of LLMs for automated unit test generation, one of the key prior works in this area."
    450     },
    451     {
    452       "title": "Using Large Language Models to Generate JUnit Tests: An Empirical Study",
    453       "authors": ["M. L. Siddiq", "J. C. Da Silva Santos", "R. H. Tanvir"],
    454       "year": 2024,
    455       "relevance": "Evaluation of LLMs for JUnit test generation with new metrics, directly comparable to AGONETEST's evaluation methodology."
    456     },
    457     {
    458       "title": "No More Manual Tests? Evaluating and Improving ChatGPT for Unit Test Generation",
    459       "authors": ["Z. Yuan", "Y. Lou", "M. Liu"],
    460       "year": 2023,
    461       "arxiv_id": "2305.04207",
    462       "relevance": "ChatTester tool for evaluating and improving LLM-generated unit tests, a direct precursor to AGONETEST."
    463     },
    464     {
    465       "title": "ChatGPT vs SBST: A Comparative Assessment of Unit Test Suite Generation",
    466       "authors": ["Y. Tang", "Z. Liu", "Z. Zhou", "X. Luo"],
    467       "year": 2024,
    468       "relevance": "Direct comparison of ChatGPT vs search-based software testing for unit test generation."
    469     },
    470     {
    471       "title": "Methods2Test: A Dataset of Focal Methods Mapped to Test Cases",
    472       "authors": ["M. Tufano", "S. K. Deng", "N. Sundaresan", "A. Svyatkovskiy"],
    473       "year": 2022,
    474       "relevance": "The foundational dataset (780,944 method-test pairs from 9,410 repos) that CLASSES2TEST extends to class-level."
    475     },
    476     {
    477       "title": "Unit Test Case Generation with Transformers and Focal Context",
    478       "authors": ["M. Tufano", "D. Drain", "A. Svyatkovskiy"],
    479       "year": 2020,
    480       "arxiv_id": "2009.05617",
    481       "relevance": "AthenaTest: an early transformer-based approach to unit test generation using a fine-tuned BART model."
    482     },
    483     {
    484       "title": "Retrieval-Based Prompt Selection for Code-Related Few-Shot Learning",
    485       "authors": ["N. Nashid", "M. Sintaha", "A. Mesbah"],
    486       "year": 2023,
    487       "relevance": "CEDAR: retrieval-based prompt construction for code generation few-shot learning, relevant to prompt engineering for test generation."
    488     },
    489     {
    490       "title": "ChatUnitTest: A Framework for LLM-Based Test Generation",
    491       "authors": ["Y. Chen", "Z. Hu", "C. Zhi"],
    492       "year": 2024,
    493       "relevance": "LLM-based test generation framework, directly relevant to automated test generation infrastructure."
    494     },
    495     {
    496       "title": "Large Language Models Are Few-Shot Testers: Exploring LLM-Based General Bug Reproduction",
    497       "authors": ["S. Kang", "J. Yoon", "S. Yoo"],
    498       "year": 2023,
    499       "relevance": "Explores LLM-based bug reproduction and test generation, relevant to automated correction of generated test code."
    500     },
    501     {
    502       "title": "Large Language Models Are Zero-Shot Fuzzers: Fuzzing Deep-Learning Libraries via Large Language Models",
    503       "authors": ["Y. Deng", "C. S. Xia", "H. Peng"],
    504       "year": 2023,
    505       "relevance": "LLM-based fuzzing for software testing, demonstrating zero-shot test generation capabilities."
    506     }
    507   ],
    508   "engagement_factors": {
    509     "practical_relevance": {
    510       "score": 2,
    511       "justification": "AGONETEST addresses a real practitioner need (automated test generation), but the tool code is not released and the 34% green suite rate limits immediate usability."
    512     },
    513     "surprise_contrarian": {
    514       "score": 0,
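    515       "justification": "Results largely confirm expected patterns: GPT-4 beats GPT-3.5, human tests beat LLMs on mutation coverage, many generated tests don't compile. The one unexpected result (zero-shot outperforming few-shot for GPT-4) rests on a single few-shot example and no statistical tests, so it is not counted as a robust contrarian finding."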
    516     },
    517     "fear_safety": {
    518       "score": 0,
    519       "justification": "No security or safety implications; the paper addresses software testing automation."
    520     },
    521     "drama_conflict": {
    522       "score": 0,
    523       "justification": "No controversy or conflict; straightforward empirical evaluation of test generation approaches."
    524     },
    525     "demo_ability": {
    526       "score": 1,
    527       "justification": "The CLASSES2TEST dataset is available via anonymous link, but the AGONETEST tool code is not released for others to try."
    528     },
    529     "brand_recognition": {
    530       "score": 1,
    531       "justification": "Uses GPT-4 and GPT-3.5 (well-known models), but the authors are affiliated with regional universities and a company (Wideverse), not a major AI lab."
    532     }
    533   }
    534 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs