scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19845B)
      1 {
      2   "paper": {
      3     "title": "CHATASSERT: LLM-Based Test Oracle Generation With External Tools Assistance",
      4     "authors": ["Ishrak Hayet", "Adam Scott", "Marcelo d'Amorim"],
      5     "year": 2025,
      6     "venue": "IEEE Transactions on Software Engineering",
      7     "doi": "10.1109/TSE.2024.3519159"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository provided: https://github.com/ncsu-swat/chatassert (Section I, end of contributions list)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The evaluation uses the publicly available TECO dataset; the paper states 'We evaluate CHATASSERT on a part of the evaluation set of the TECO dataset'."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup is described in the paper. Tools like JavaParser and UniXCoder are mentioned but no version details or environment specs are provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README instructions or reproduction scripts are described."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as point estimates in tables (e.g., Acc@1 = 0.45). No confidence intervals or error bars are provided."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Chi-Square tests with p-values are reported for key comparisons (e.g., 'Chi-Square value is 66.76 with a p-value of 3.04 × 10^-16' in RQ2; 'Chi-Square value is 13.02 with a p-value of 0.0003' in RQ3). Mann-Whitney U implied for mutation strength (p-value < 0.05)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Cohen's h reported for accuracy comparisons (1.3 for RQ2, 0.58 for RQ3) and Cliff's Delta for mutation strength (0.51 and 0.38)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Sample size of 500 is justified: 'We randomly sample 500 distinct samples from a total of 3,540 examples in the TECO dataset for a 98% confidence level and 5% error margin' (Section V-A3)."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviation, variance, or spread measures are reported across runs. Results appear to be single-run numbers. The stochastic nature of LLM outputs means variance across runs would be relevant."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple baselines: ATLAS, TOGA, TECO (prior work), plus four LLMs without CHATASSERT's prompt engineering (CHATGPT, Mistral, Codestral, Magicoder)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "TECO (2023) was the SoTA at the time of submission. Codestral and Mistral are contemporary open-source LLMs."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Table VI shows ablation removing each of the four features (CS, EX, SR, DR) individually, measuring impact on all metrics."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics: Acc@1/3/5/10, BLEU, CodeBLEU, ROUGE, EditSim, and mutation scores."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of generated oracles. Evaluation is entirely automated (exact match, NLP metrics, mutation testing). Human judgment of oracle quality/usefulness would be relevant."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "They use the TECO evaluation dataset (a separate test set from TECO's training data), with a randomly sampled subset of 500 examples."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No per-project or per-category breakdown of results. Only aggregate numbers across all 500 samples are reported."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section VI-A discusses negative examples where CHATASSERT fails (anonymous inner classes, specific test cases). Fig. 9 shows concrete failure cases."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that TECO outperforms CHATASSERT on some NLP metrics (EditSim, CodeBLEU in some configs), and that EX feature had less impact. Also notes CHATASSERT-DR has lower NLP metrics than TECO (Table V)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract's claim 'improves Acc@1 by 15%' is supported by Table IV (CHATASSERT 0.45 vs TECO 0.30, i.e., 15 percentage points). The claim 'improves Acc@10 by 12%' likewise matches Table IV (0.54 vs 0.42, i.e., 12 percentage points)."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about feature contributions are supported by a controlled ablation study (Table VI) in which each feature is removed individually. The ablation design is an adequate single-variable manipulation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title and abstract claim generality for 'LLM-based oracle generation' but results are only on Java projects from the TECO dataset. No explicit bounding to Java or to this specific dataset's characteristics."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The threats to validity section does not substantively discuss alternative explanations for the improvements. The possibility that ChatGPT saw TECO dataset projects in training is raised but countered only with the low Acc@1 (0.10) argument, and there is no discussion of whether the improvement comes from additional compute (more LLM queries) rather than the specific features."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper uses 'GPT3.5' / 'CHATGPT' without specifying a version or snapshot date (e.g., gpt-3.5-turbo-0613). Mistral-7B and Magicoder-6.7B and Codestral-22B give sizes but not specific versions/snapshots."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Fig. 5 shows the CHATGPT-ONE prompt template. Figs. 2 and 4 show detailed prompt examples with actual text for the full CHATASSERT pipeline including code summarization and repair prompts."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Temperature is mentioned conceptually ('a variable that sets the level of (un)predictability') but the actual temperature setting used is not specified. 'Default temperature' is stated for CHATGPT-ONE but the default value is not given. No other hyperparameters (top-p, max tokens) reported."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The iterative scaffolding is described in detail: Algorithm 1 shows the full pseudocode, Table I describes features, and the generation/repair loop with feedback mechanisms is thoroughly documented."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section V-A1 describes normalization (semantic-preserving transformations on assertions). Section V-A3 describes the stratified sampling procedure, including the exclusion of Hamcrest assertions and helper-method assertions."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section VI-C 'Threats to Validity' provides a dedicated subsection discussing construct, internal, and external validity threats."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats discussed: sampling strategy limitations, possibility of test cases in ChatGPT training data (with Acc@1=0.10 mitigation argument), choice of metrics, and the specific dataset used."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit statement of what the results do NOT show. The paper does not bound its claims to Java, to the TECO dataset characteristics, or state what populations/settings are excluded."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw experimental data (individual per-sample results, LLM outputs) is made available. Only aggregate metrics in tables."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Data collection is described: they use the TECO evaluation dataset with a stratified random sampling procedure selecting 500 samples from 3,540, with specific inclusion/exclusion criteria (Section V-A3)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data comes from a standard benchmark dataset (TECO)."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The sampling pipeline is documented: 51 projects → 10 per project → exclude Hamcrest/helper methods → fill to 500. The evaluation pipeline (compile, run, match) is described in Algorithm 1."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "NSF grants disclosed: 'This work was supported by the National Science Foundation under Grant 2319472 and Grant 2349961.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are affiliated with North Carolina State University. No evaluated product is from their institution."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "NSF is a government funding agency with no financial interest in whether CHATASSERT outperforms baselines."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training data cutoff date stated for any of the models used (GPT-3.5, Mistral, Codestral, Magicoder)."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Section VI-C discusses this threat: 'Since ChatGPT is a closed model and the training data are not available to end users... it is difficult to determine whether a test case belongs to the training dataset.' They use low Acc@1 (0.10) as evidence against significant contamination."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The TECO dataset uses open-source Java projects that predate GPT-3.5's training, so they could have been included in its training data. While contamination is mentioned in threats, no concrete analysis is done (e.g., temporal analysis, canary strings). The low Acc@1 argument is weak evidence."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Section VI-B2 reports '∼10m, on average, for CHATASSERT to generate an assertion' and discusses the cost implications of multiple ChatGPT API calls."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total API spend, GPU hours, or overall computational budget is stated. Only average per-assertion time is mentioned."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CHATASSERT improves Acc@1 by 15% over TECO (SoTA)",
    286       "evidence": "Table IV: CHATASSERT Acc@1 = 0.45 vs TECO Acc@1 = 0.30. Chi-Square = 13.02, p = 0.0003, Cohen's h = 0.58.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "CHATASSERT's prompt engineering significantly improves all tested LLMs",
    291       "evidence": "Table III: CHATASSERT-CHATGPT Acc@10 = 0.54 vs CHATGPT Acc@10 = 0.28. Chi-Square = 66.76, p = 3.04e-16, Cohen's h = 1.3. Similar improvements for Mistral, Magicoder, Codestral.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "All four features (CS, EX, SR, DR) contribute positively to CHATASSERT's performance",
    296       "evidence": "Table VI ablation: removing any feature reduces Acc@10 by 6-7 ppt. CS has the largest impact on Acc@1 (11 ppt drop).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "CHATASSERT-generated oracles have significantly higher mutation strength than TECO",
    301       "evidence": "Fig. 6(b): p-value < 0.05, Cliff's Delta = 0.51 (medium effect). CHATASSERT-DR also higher: Cliff's Delta = 0.38.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "CHATASSERT-DR (without dynamic repair) still outperforms TECO",
    306       "evidence": "Table V: CHATASSERT-DR Acc@1 = 0.35 vs TECO 0.30, Acc@10 = 0.47 vs 0.42.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "CHATASSERT, a prompt engineering framework for LLM-based test oracle generation using code summarization, examples, and static/dynamic repair, improves Acc@1 by 15 percentage points over the state-of-the-art TECO on the TECO dataset (0.45 vs 0.30). The approach generalizes across four LLMs (ChatGPT, Mistral, Codestral, Magicoder) with significant improvements. Ablation shows all four features contribute, with code summarization being most impactful for Acc@1. Generated oracles also demonstrate significantly higher mutation-killing ability than TECO's.",
    312   "red_flags": [
    313     {
    314       "flag": "No variance across runs",
    315       "detail": "LLM outputs are stochastic, yet all results appear to be single-run. No standard deviation or multiple-run analysis is reported despite using temperature-dependent API calls."
    316     },
    317     {
    318       "flag": "Model versions unspecified",
    319       "detail": "GPT-3.5 is used without a snapshot version. Model behavior changes across versions, making reproduction uncertain."
    320     },
    321     {
    322       "flag": "Weak contamination argument",
    323       "detail": "The argument that low Acc@1 (0.10) means ChatGPT hasn't seen the test cases is weak — partial memorization could still inflate results without producing exact matches."
    324     }
    325   ],
    326   "cited_papers": [
    327     {
    328       "title": "Learning deep semantics for test completion",
    329       "authors": ["P. Nie", "R. Banerjee", "J. J. Li", "R. J. Mooney", "M. Gligoric"],
    330       "year": 2023,
    331       "relevance": "TECO is the SoTA baseline for test oracle generation that CHATASSERT outperforms."
    332     },
    333     {
    334       "title": "CODAMOSA: Escaping coverage plateaus in test generation with pre-trained large language models",
    335       "authors": ["C. Lemieux", "J. P. Inala", "S. K. Lahiri", "S. Sen"],
    336       "year": 2023,
    337       "relevance": "LLM-augmented test generation technique combining search-based testing with LLMs."
    338     },
    339     {
    340       "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    341       "authors": ["C. S. Xia", "L. Zhang"],
    342       "year": 2024,
    343       "relevance": "ChatRepair uses conversational LLM interaction for program repair, similar iterative approach to CHATASSERT."
    344     },
    345     {
    346       "title": "Evaluating large language models trained on code",
    347       "authors": ["M. Chen"],
    348       "year": 2021,
    349       "arxiv_id": "2107.03374",
    350       "relevance": "Codex/HumanEval benchmark paper foundational to LLM code generation evaluation."
    351     },
    352     {
    353       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    354       "authors": ["J. Wei"],
    355       "year": 2022,
    356       "relevance": "CHATASSERT's code summarization pipeline is analogous to chain-of-thought prompting."
    357     },
    358     {
    359       "title": "TOGA: A neural method for test oracle generation",
    360       "authors": ["E. Dinella", "G. Ryan", "T. Mytkowicz", "S. K. Lahiri"],
    361       "year": 2022,
    362       "relevance": "Grammar-based neural oracle generation baseline compared against CHATASSERT."
    363     },
    364     {
    365       "title": "On learning meaningful assert statements for unit test cases",
    366       "authors": ["C. Watson", "M. Tufano", "K. Moran", "G. Bavota", "D. Poshyvanyk"],
    367       "year": 2020,
    368       "relevance": "ATLAS: early deep learning approach for assertion generation, baseline in this study."
    369     },
    370     {
    371       "title": "Neural-based test oracle generation: A large-scale evaluation and lessons learned",
    372       "authors": ["S. B. Hossain", "A. Filieri", "M. B. Dwyer", "S. Elbaum", "W. Visser"],
    373       "year": 2023,
    374       "relevance": "Large-scale evaluation of neural oracle generation techniques including TOGA."
    375     },
    376     {
    377       "title": "Less training, more repairing please: Revisiting automated program repair via zero-shot learning",
    378       "authors": ["C. S. Xia", "L. Zhang"],
    379       "year": 2022,
    380       "relevance": "Zero-shot LLM-based program repair, related to CHATASSERT's repair features."
    381     },
    382     {
    383       "title": "Code generation tools (almost) for free? A study of few-shot, pre-trained language models on code",
    384       "authors": ["P. Bareiß", "B. Souza", "M. d'Amorim", "M. Pradel"],
    385       "year": 2022,
    386       "arxiv_id": "2206.01335",
    387       "relevance": "Study of few-shot LLM code generation by overlapping authors, relevant to LLM code capabilities."
    388     }
    389   ]
    390 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs