ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32636B)


      1 {
      2   "paper": {
      3     "title": "No More Manual Tests? Evaluating and Improving ChatGPT for Unit Test Generation",
      4     "authors": [
      5       "Zhiqiang Yuan",
      6       "Mingwei Liu",
      7       "Shiji Ding",
      8       "Kaixin Wang",
      9       "Yixuan Chen",
     10       "Xin Peng",
     11       "Yiling Lou"
     12     ],
     13     "year": 2024,
     14     "venue": "Proc. ACM Softw. Eng. (FSE)",
     15     "arxiv_id": "2305.04207",
     16     "doi": "10.1145/3660783"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval", "qualitative"],
     21   "key_findings": "Only 24.8% of ChatGPT-generated unit tests pass execution, with 57.9% failing compilation (mainly symbol resolution errors) and 85.5% of execution failures caused by incorrect assertions. However, passing tests achieve comparable coverage, readability, and usability to manually-written tests. The proposed ChatTester approach, which decomposes generation into intention inference plus iterative compilation error fixing, raises the compilable rate by 34.3 percentage points (39.0% to 73.3%) and the passing rate by 18.7 percentage points (22.3% to 41.0%) over default ChatGPT, and generalizes to open-source LLMs (CodeLlama-Instruct, CodeFuse).",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper states 'The data and code can be found on our website [12]' and reference [12] provides the URL https://github.com/FudanSELab/ChatTester/tree/main."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Reference [12] links to a GitHub repository stated to contain data and code. The paper says the benchmark statistics can be found there."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No requirements.txt, Dockerfile, or detailed environment specifications are provided in the paper. The paper mentions Maven-built Java projects and ChatGPT API but does not specify library versions or dependencies for reproduction."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are provided in the paper. While a GitHub repo is referenced, the paper itself does not include a reproducing-results section or describe how to run the experiments."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All results are reported as point estimates (e.g., 24.8% passing rate, 42.1% compilation rate). Although the ChatTester evaluation was repeated 3 times, no confidence intervals or error bars accompany the averaged results."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No statistical significance tests are used anywhere in the paper. All comparative claims (e.g., ChatTester vs ChatGPT, ChatGPT vs AthenaTest) are based solely on comparing raw percentages without any statistical test."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Effect sizes are reported with baseline context throughout: '34.3% more compilable tests (= 73.3% - 39.0%)', '18.7% more tests with correct assertions (= 41.0% - 22.3%)'. Coverage improvements are similarly contextualized (Table 5, Table 10)."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The main benchmark of 1,000 focal methods is described as sampled from 1,748 data pairs 'considering the costs of using ChatGPT API and the manual efforts' but no formal sample size justification or power analysis is given. The user study's 219-test subset for RQ7 is justified by '0.05 error margin at a 95% confidence level' but the main N=1,000 is not."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The ChatTester evaluation (RQ5-RQ6) is repeated 3 times with averages presented, but no standard deviation, IQR, or any spread measure is reported. The main study (RQ1-RQ4) was not repeated at all."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Two baselines are included: EvoSuite (search-based, state-of-the-art traditional technique) and AthenaTest (learning-based, reproduced with CodeT5). Manually-written tests are also used as reference."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "EvoSuite is a continuously-maintained tool and AthenaTest (2020) was the most relevant learning-based technique. The paper explicitly explains why other techniques like CODAMOSA and LIBRO are excluded (they target different scenarios, Section 8.2)."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Table 7 presents a clear ablation: ChatGPT (base), ChatTester-Ite (without iterative refiner), ChatTester-Ini (without initial generator), and full ChatTester. Each component's contribution is quantified."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Multiple metrics are used: syntactic correctness, compilation rate, execution passing rate (RQ1), statement and branch coverage (RQ2), number of assertions (RQ2), readability scores (RQ3), and usability scores (RQ4)."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "A user study with 5 participants evaluates readability and usability of generated tests in RQ3, RQ4, and RQ7. Participants scored tests on naming intuitiveness, code layout, assertion quality, and adoption efforts using detailed criteria (Table 1)."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The ChatTester evaluation (RQ5) uses a separate set of 100 data pairs from the remaining 748 not used in the empirical study, explicitly to 'avoid using the same benchmark that has been extensively analyzed' (Section 6.1)."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Extensive breakdowns are provided: compilation error categories (Table 3), execution error categories (Table 4), per-project coverage (Table 10), iteration distribution (Table 8), and per-participant readability/usability scores (Figures 5, 6, 10)."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 4.1 provides detailed failure analysis including a taxonomy of compilation errors (Table 3), execution errors (Table 4), and a concrete NullPointerException example (Figure 3). Section 4.5 discusses root causes of failures."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper prominently reports that only 24.8% of ChatGPT-generated tests pass execution. It extensively discusses failure modes and limitations, making negative results a central part of the contribution."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims match the results: '34.3% more compilable tests and 18.7% more tests with correct assertions' matches Table 7 (73.3%-39.0%=34.3%, 41.0%-22.3%=18.7%). Claims about correctness issues match Section 4.1. Generalization claims match Table 9."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper claims ChatTester 'improves' test correctness, which is a causal claim. The ablation study (Table 7) with controlled single-component removal (ChatTester-Ite, ChatTester-Ini) provides adequate evidence for component-level causal claims."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper explicitly states it focuses on gpt-3.5-turbo 'due to the limited rate of GPT-4 API' (Section 3.4), specifies Java as the primary language, and tests generalization to Python and two other LLMs. The title uses a question mark rather than an assertion."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Section 7 discusses alternative explanations: ChatGPT randomness as a source of result variation, potential data leakage inflating ChatGPT's apparent capability (with Levenshtein distance analysis), and benchmark-specific findings that may not generalize."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures compilation rate, passing rate, coverage, readability scores, and usability scores. These metrics directly align with the claimed evaluation dimensions (correctness, sufficiency, readability, usability). No proxy gap exists between measurements and claims."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "The paper states 'we focus on the gpt-3.5-turbo model' without specifying a snapshot date or version (e.g., gpt-3.5-turbo-0301). CodeLlama-Instruct-34B and CodeFuse-34B are specified with model sizes but without version hashes."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Full prompt text is provided: Figure 1 shows the complete basic prompt including NL and CC parts, Figure 8 shows the intention and generation prompts, and Figure 9 shows the iterative refinement prompt with error messages and additional code context."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "The paper states 'we use the official ChatGPT API with the default setting' without specifying temperature, top-p, max tokens, or other sampling parameters. These significantly affect output quality and reproducibility."
    161       },
    162       "scaffolding_described": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "ChatTester's scaffolding is described in detail: Figure 7 shows the complete workflow, the initial test generator's decomposition into intention and generation steps (Section 5.1), and the iterative test refiner's validate-and-fix loop with validator, EM parser, and code analyzer (Section 5.2)."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 3.1 documents the full pipeline: 4,685 CodeSearchNet Java projects filtered by maintenance recency, 100+ stars, and Maven compilation → 185 projects → focal method/test pair extraction via path and class name matching → 1,748 pairs → sampling 1,000."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 7 'THREATS TO VALIDITY' provides a dedicated discussion of three specific threats with substantive analysis including the Levenshtein distance computation."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The threats are specific to this study: (i) ChatGPT randomness mitigated by 3x repetition, (ii) benchmark may not generalize — addressed by diverse project selection and additional Python evaluation on HumanEval, (iii) data leakage quantified via Levenshtein distance (avg 1,027 chars, 5% < 200 chars)."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The paper states it focuses on gpt-3.5-turbo rather than GPT-4 (Section 3.4), focuses on Java with extension to Python (Section 7), does not consider test completion scenarios (excluding Teco), and excludes CODAMOSA/LIBRO due to different scenarios (Section 8.2)."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The paper states 'The data and code can be found on our website [12]' pointing to a GitHub repository. The benchmark data pairs and generated tests are referenced as available."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 3.1 describes data collection in detail: project source (CodeSearchNet), cloning date (March 25, 2023), filtering criteria (recency, stars, Maven), focal-method-to-test matching procedure via file path and class name, and parameter-based disambiguation."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "For the user study, the paper states only 'We invite five participants whose Java development experiences vary from 4 years to 5 years' without describing how they were recruited (e.g., students, colleagues, external developers) or potential selection bias."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The pipeline is documented with counts: 4,685 Java projects → 185 after filtering → 1,748 data pairs extracted → 1,000 sampled for study, 100 from remaining 748 for ChatTester evaluation. Each filtering criterion and its effect are described."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding source, grants, or acknowledgments section is visible in the paper text provided."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "All seven authors are listed with their affiliation at the Department of Computer Science, Fudan University, China, with full email addresses."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding is disclosed, making it impossible to assess funder independence. The paper evaluates OpenAI's ChatGPT, and the authors are academic researchers with no disclosed connection to OpenAI."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial disclosure statement is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The paper does not state ChatGPT's training data cutoff date. It uses gpt-3.5-turbo without specifying when its training data was collected."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": true,
    243         "justification": "Section 7 threat (iii) explicitly discusses 'the potential data leakage of the manually-written tests being part of the training data in ChatGPT' and analyzes Levenshtein distance between generated and manual tests (avg 1,027 chars, only 5% < 200 chars)."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": true,
    248         "justification": "The paper addresses contamination risk: the benchmark projects are from GitHub (which ChatGPT may have trained on), and Section 7 analyzes output similarity via Levenshtein distance to assess whether ChatGPT is copying from training data. For AthenaTest, overlap data is explicitly removed from fine-tuning."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "No pre-registration of the user study is mentioned."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": true,
    259         "answer": false,
    260         "justification": "No IRB or ethics board approval is mentioned for the user study involving 5 human participants."
    261       },
    262       "demographics_reported": {
    263         "applies": true,
    264         "answer": true,
    265         "justification": "Minimal demographics are reported: 'five participants whose Java development experiences vary from 4 years to 5 years' (Section 3.4). Experience level is characterized, though other demographics are absent."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": true,
    269         "answer": false,
    270         "justification": "No inclusion or exclusion criteria for participant selection are stated. The paper only says 'We invite five participants' without describing any screening process."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "The user study is not an experimental study with treatment/control conditions requiring randomization. All participants evaluate the same set of tests."
    276       },
    277       "blinding_described": {
    278         "applies": true,
    279         "answer": true,
    280         "justification": "The paper states 'participants are not informed which test is generated by ChatGPT or which is written manually' (Section 3.4), describing single-blinding of the evaluation."
    281       },
    282       "attrition_reported": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No attrition or dropout information is reported. The paper does not explicitly state that all 5 participants completed the full evaluation."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Wall-clock time is reported: 'The average time cost of ChatTester is around 99.0 seconds, where the initial test generator takes 15.0 seconds while each iteration in the iterative refiner takes 30.0 seconds on average' (Section 6.2.1). Times for CodeLlama and CodeFuse variants are also reported."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No total computational budget is stated. Per-test latency is reported, but total API costs, total wall-clock time for the full experiment, and hardware specifications are not provided."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The ChatTester evaluation was repeated 3 times, but no variance or sensitivity analysis across runs is reported. Only averaged results are presented in Table 7 without spread measures."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Section 6.1 states 'we repeat all experiments three times and present the average results.' The main study (RQ1-RQ4) was not repeated, which is acknowledged in Section 7."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search is described. The paper uses 'default setting' for ChatGPT API and sets ChatTester parameters (max 3 invalid refinements) without describing how these values were chosen."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "ChatTester design choices (e.g., max 3 invalid refinements, max 10 iterations) are not justified by comparison with alternatives. Only the default configuration is evaluated."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No statistical significance tests are performed in the paper, so multiple comparison correction is not applicable."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors evaluate their own ChatTester system and their re-implementation of AthenaTest (since the original model was not released). They acknowledge 'we try our best to reproduce it' but do not discuss author-evaluation bias or the risk that their baseline reimplementation may underperform."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "ChatTester takes ~99s per test vs ~10s for default ChatGPT (multiple API calls plus compilation), but this compute difference is not discussed as a confound when comparing effectiveness. Performance is not normalized by compute budget."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper does not discuss whether compilation rate and execution pass rate adequately measure test generation quality. No analysis of whether these metrics capture what developers actually need from generated tests beyond the user study."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": true,
    343         "answer": true,
    344         "justification": "The ablation study (Table 7) isolates ChatTester's scaffold components. When comparing ChatGPT vs Evosuite, the paper explicitly states 'we do not intend to conclude that Evosuite is better at generating more correct tests' and explains the methodological differences. RQ6 applies the same ChatTester framework across different LLMs."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": true,
    351         "justification": "Section 7 discusses that manually-written tests may be part of ChatGPT's training data, which is a temporal leakage concern (benchmark code existed before model training). They analyze Levenshtein distance to assess how much output resembles potential training data."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the prompt context (focal class fields, method signatures) provides hints beyond what would be available in realistic usage scenarios."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": true,
    361         "justification": "For AthenaTest, the paper explicitly removes overlap data from the fine-tuning dataset 'if it is duplicated in our benchmark based on character matching' (Section 3.3). For ChatGPT, the Levenshtein distance analysis partially addresses independence concerns."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": true,
    366         "justification": "Levenshtein edit distance is used as a concrete detection method: 'average of 1,027 characters and median of 609 characters' distance between generated and manual tests, with 'only 5% of ChatTester-generated tests have less than 200-characters Levenshtein distance' (Section 7)."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Only 24.8% of ChatGPT-generated tests pass execution, while 57.9% fail compilation and 17.3% fail during execution.",
    373       "evidence": "Table 2 (Section 4.1) reports correctness metrics on 1,000 focal methods: 42.1% compile successfully, 24.8% pass execution. Error breakdown in Tables 3-4.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "ChatGPT substantially outperforms existing learning-based technique AthenaTest in test correctness (42.1% vs 18.8% compilation, 24.8% vs 14.4% passing).",
    378       "evidence": "Table 2 presents direct comparison on the same 1,000-method benchmark. AthenaTest was reproduced by fine-tuning CodeT5 on the same dataset.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Passing ChatGPT-generated tests achieve comparable coverage, readability, and usability to manually-written tests.",
    383       "evidence": "Table 5 shows 82.3% statement coverage (vs 84.2% manual). User study with 5 participants (Figures 5-6, Sections 4.3-4.4) shows comparable readability and usability scores.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "85.5% of ChatGPT's execution failures are caused by assertion errors (incorrect assertions), not runtime exceptions.",
    388       "evidence": "Table 4 shows 148 of 173 execution failures are assertion errors. Manual inspection confirmed all are due to incorrect assertions, not bugs in the focal method.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "ChatTester improves the compilable rate by 34.3 percentage points and the passing rate by 18.7 percentage points over default ChatGPT.",
    393       "evidence": "Table 7 (Section 6.2.1) shows ChatTester achieves 73.3% compilation (vs 39.0%) and 41.0% passing (vs 22.3%) on a separate 100-method evaluation set, averaged over 3 runs.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Both ChatTester components (initial test generator and iterative test refiner) positively contribute to effectiveness.",
    398       "evidence": "Table 7 ablation: ChatTester-Ite (intention only) achieves 50.7%/29.7%, ChatTester-Ini (iterative only) achieves 60.6%/34.0%, full ChatTester achieves 73.3%/41.0%.",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "ChatTester generalizes to open-source LLMs: 21%/23% compilation improvement and 11%/18% passing improvement for CodeLlama-Instruct-34B and CodeFuse-34B.",
    403       "evidence": "Table 9 (Section 6.2.2) shows improvements on the same 100-method evaluation set for both open-source models.",
    404       "supported": "moderate"
    405     }
    406   ],
    407   "red_flags": [
    408     {
    409       "flag": "Very small user study",
    410       "detail": "Only 5 participants with 4-5 years of Java experience evaluate readability and usability. No inter-rater reliability is reported. Participants were paid $250 each, which could influence responses. Recruitment method is not described, raising selection bias concerns."
    411     },
    412     {
    413       "flag": "No statistical significance tests",
    414       "detail": "All comparative claims between ChatGPT, ChatTester variants, and baselines rely entirely on comparing raw percentages. No p-values, confidence intervals, or significance tests are used anywhere despite making multiple improvement claims."
    415     },
    416     {
    417       "flag": "Main study not repeated",
    418       "detail": "The primary empirical study on 1,000 focal methods (RQ1-RQ4) was conducted only once. Only the smaller ChatTester evaluation (100 methods, RQ5) was repeated 3 times, and even then no spread measures were reported."
    419     },
    420     {
    421       "flag": "Survivorship bias in coverage comparison",
    422       "detail": "Table 5 coverage is calculated only on focal methods where both ChatGPT and EvoSuite generate executable tests, biasing the comparison toward easier-to-test methods and potentially inflating ChatGPT's coverage performance relative to the full benchmark."
    423     },
    424     {
    425       "flag": "Baseline reimplementation",
    426       "detail": "AthenaTest was reimplemented by the authors using CodeT5 instead of the original BART model since the original model was not released. This introduces a risk that the baseline is weaker than the original; the authors state only that they 'try our best to reproduce it' and do not discuss how an underperforming reimplementation could affect the comparison."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Unit test case generation with transformers and focal context",
    432       "authors": ["Michele Tufano", "Dawn Drain", "Alexey Svyatkovskiy", "Shao Kun Deng", "Neel Sundaresan"],
    433       "year": 2020,
    434       "arxiv_id": "2009.05617",
    435       "relevance": "AthenaTest: foundational learning-based unit test generation approach using transformers, used as a baseline in this paper."
    436     },
    437     {
    438       "title": "TOGA: A Neural Method for Test Oracle Generation",
    439       "authors": ["Elizabeth Dinella", "Gabriel Ryan", "Todd Mytkowicz", "Shuvendu K. Lahiri"],
    440       "year": 2022,
    441       "relevance": "Neural approach for generating test oracles, directly relevant to LLM-based test generation methodology."
    442     },
    443     {
    444       "title": "CodaMosa: Escaping Coverage Plateaus in Test Generation with Pre-trained Large Language Models",
    445       "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K. Lahiri", "Siddhartha Sen"],
    446       "year": 2023,
    447       "relevance": "Combines LLM-generated tests with search-based testing to escape coverage plateaus; complementary approach to ChatTester."
    448     },
    449     {
    450       "title": "Large Language Models are Few-shot Testers: Exploring LLM-based General Bug Reproduction",
    451       "authors": ["Sungmin Kang", "Juyeon Yoon", "Shin Yoo"],
    452       "year": 2023,
    453       "relevance": "LIBRO: uses LLMs for test generation from bug reports, exploring few-shot LLM capabilities for testing."
    454     },
    455     {
    456       "title": "Learning Deep Semantics for Test Completion",
    457       "authors": ["Pengyu Nie", "Rahul Banerjee", "Junyi Jessy Li", "Raymond J. Mooney", "Milos Gligoric"],
    458       "year": 2023,
    459       "relevance": "Teco: learning-based test completion technique using deep semantics, a related approach to LLM-based test generation."
    460     },
    461     {
    462       "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation",
    463       "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"],
    464       "year": 2024,
    465       "relevance": "TestPilot: concurrent empirical evaluation of LLMs for unit test generation with a different feedback mechanism."
    466     },
    467     {
    468       "title": "Self-collaboration Code Generation via ChatGPT",
    469       "authors": ["Yihong Dong", "Xue Jiang", "Zhi Jin", "Ge Li"],
    470       "year": 2023,
    471       "arxiv_id": "2304.07590",
    472       "relevance": "Explores ChatGPT's self-collaboration capabilities for code generation, related to using LLMs for SE tasks."
    473     },
    474     {
    475       "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    476       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    477       "year": 2023,
    478       "arxiv_id": "2304.00385",
    479       "relevance": "Demonstrates conversational ChatGPT for automated program repair, related iterative LLM approach for SE tasks."
    480     },
    481     {
    482       "title": "Software Testing with Large Language Model: Survey, Landscape, and Vision",
    483       "authors": ["Junjie Wang", "Yuchao Huang", "Chunyang Chen", "Zhe Liu", "Song Wang", "Qing Wang"],
    484       "year": 2023,
    485       "arxiv_id": "2307.07221",
    486       "relevance": "Survey of LLMs for software testing providing broader context for the unit test generation landscape."
    487     },
    488     {
    489       "title": "EvoSuite: automatic test suite generation for object-oriented software",
    490       "authors": ["Gordon Fraser", "Andrea Arcuri"],
    491       "year": 2011,
    492       "relevance": "State-of-the-art search-based test generation tool, used as a primary baseline for evaluating ChatGPT."
    493     },
    494     {
    495       "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-level Code Generation",
    496       "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"],
    497       "year": 2023,
    498       "arxiv_id": "2308.01861",
    499       "relevance": "LLM code generation benchmark that evaluates class-level capabilities, relevant to understanding LLM coding ability."
    500     },
    501     {
    502       "title": "Retrieval-Based Prompt Selection for Code-Related Few-Shot Learning",
    503       "authors": ["Noor Nashid", "Mifta Sintaha", "Ali Mesbah"],
    504       "year": 2023,
    505       "relevance": "Proposes retrieval-based prompt construction for LLM test assertion generation, a related prompting strategy for code tasks."
    506     }
    507   ],
    508   "engagement_factors": {
    509     "practical_relevance": {
    510       "score": 2,
    511       "justification": "ChatTester is a usable tool with released code, but requires ChatGPT API integration and is specific to Java unit testing."
    512     },
    513     "surprise_contrarian": {
    514       "score": 1,
    515       "justification": "The 24.8% pass rate quantifies suspected limitations but does not fundamentally challenge expectations about LLM code generation quality."
    516     },
    517     "fear_safety": {
    518       "score": 0,
    519       "justification": "No AI safety, security, or risk concerns are raised by this work."
    520     },
    521     "drama_conflict": {
    522       "score": 0,
    523       "justification": "No controversy or conflict angle; straightforward empirical evaluation and improvement."
    524     },
    525     "demo_ability": {
    526       "score": 2,
    527       "justification": "Code and data released on GitHub (FudanSELab/ChatTester), though setup requires Java projects and API access."
    528     },
    529     "brand_recognition": {
    530       "score": 3,
    531       "justification": "Directly evaluates ChatGPT, one of the most widely recognized AI products, with the name prominently in the title."
    532     }
    533   }
    534 }

Impressum · Datenschutz