scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31124B)
      1 {
      2   "paper": {
      3     "title": "LLM-Powered Test Case Generation for Detecting Bugs in Plausible Programs",
      4     "authors": [
      5       "Kaibo Liu",
      6       "Zhenpeng Chen",
      7       "Yiyang Liu",
      8       "Jie M. Zhang",
      9       "Mark Harman",
     10       "Yudong Han",
     11       "Yun Ma",
     12       "Yihong Dong",
     13       "Ge Li",
     14       "Gang Huang"
     15     ],
     16     "year": 2024,
     17     "venue": "Annual Meeting of the Association for Computational Linguistics",
     18     "arxiv_id": "2404.10304",
     19     "doi": "10.18653/v1/2025.acl-long.20"
     20   },
     21   "scan_version": 3,
     22   "active_modules": ["experimental_rigor", "data_leakage"],
     23   "methodology_tags": ["benchmark-eval"],
     24   "key_findings": "TrickCatcher, an LLM-powered test case generation approach combining PUT-guided program variant generation, generator-based input generation, and diversity-driven differential testing, achieves F1 scores of 41.31%, 42.35%, and 51.34% on TrickyBugs (C++), TrickyBugs (Python), and EvalPlus respectively, significantly outperforming the best baseline (Differential Prompting Plus), with an F1 score up to 1.66× that of the baseline. The approach generates up to 16× fewer false positives than baselines on correct programs. An ablation study confirms each of the three components contributes meaningfully, and the counterintuitive diversity-driven oracle selection (trusting outputs that differ from the PUT) outperforms traditional majority voting.",
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The abstract states 'Code and data used are available at https://github.com/RinCloud/TrickCatcher' and provides a working GitHub URL."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Both datasets used are publicly available: TrickyBugs (MIT license) and EvalPlus (Apache 2.0 license). The paper also states code and data are available at the GitHub repository."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No environment specifications (requirements.txt, Dockerfile, library versions) are mentioned in the paper. Only the LLM model name and the CYaRon Python library are mentioned."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No step-by-step reproduction instructions are provided in the paper. The methodology is described algorithmically but there is no 'Reproducing Results' section or equivalent."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Table 1 and all results report only point estimates (e.g., '41.31%' F1) with no confidence intervals or error bars, despite the authors performing multiple repetitions."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper claims TrickCatcher 'significantly outperforming' baselines (Section 6.1) but provides no statistical significance tests (no p-values, t-tests, or similar). Claims of superiority are based solely on comparing point estimates."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The paper reports relative improvements with context: '1.80×, 2.65×, and 1.66× those of the state-of-the-art baseline' and shows absolute baseline values alongside TrickCatcher values in Table 1, allowing readers to assess magnitude."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No justification is given for the sample sizes (366 human-written and 151 AI-generated programs). The datasets are used as-is without discussion of whether these sizes are sufficient for the claims made."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "Despite performing combinatorial repetitions (Appendix B describes C(10, k) rounds and averaging over 100 inputs), no standard deviation, IQR, or other spread measure is reported in any table or figure. Only averaged values are shown."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Three baselines are compared: DirectChat (CHAT), Differential Prompting Plus (DPP), and Automated Program Repair (APR), described in Section 5.4."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Differential Prompting (Li et al., 2023) is described as 'the state-of-the-art in LLM-based test case generation.' The baselines are from 2023, which is recent relative to the 2024 submission."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "RQ3 (Section 6.3, Table 2) presents a thorough ablation study with 6 patterns combining different variants of program generation, input generation, and differential testing components."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Three metrics are used: recall, precision, and F1 score, all reported in Table 1 and throughout the evaluation."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Evaluation is entirely automated using canonical programs and checkers. While input validity for TrickyBugs was 'manually verified' (Section 5.3), there is no human evaluation of the system's test case outputs."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "TrickyBugs and EvalPlus are established benchmarks with canonical solutions and additional test cases. The datasets provide ground truth for evaluation that is separate from any development data."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Results are broken down by dataset (TrickyBugs C++, TrickyBugs Python, EvalPlus) in Table 1, by task difficulty (RQ5, Figures 7-8), and across different numbers of program variants k."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "RQ2 (Section 6.2, Figure 5) analyzes false positives and their categories (incorrect oracles vs. invalid inputs). Section 7.1 discusses buggy program variants. RQ5 discusses where TrickCatcher has less advantage."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper reports that 40.10% of directly generated test inputs are invalid (Section 1), that recall can decrease with more variants for DPP (Table 1), and that TrickCatcher's recall on TrickyBugs Python is lower than DPP's in the best-vs-best comparison (-10.61%)."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The abstract's claims of 1.80×, 2.65×, and 1.66× recall/precision/F1 improvements are supported by Table 1's best-vs-best comparisons. F1 scores of 41.31%, 42.35%, and 51.34% are verified in the same table."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The ablation study (Table 2, RQ3) provides controlled single-variable manipulation across 6 patterns, systematically varying one component at a time to justify claims like 'each component of TrickCatcher contributes meaningfully to its overall performance.'"
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Claims are bounded to the tested datasets and languages. The title 'Detecting Bugs in Plausible Programs' accurately describes the scope. Section 7.2 tests generalization to deepseek-v3 but frames it appropriately as verifying 'generalization capability' rather than claiming universal applicability."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper does not discuss alternative explanations for why TrickCatcher outperforms baselines. For example, it does not consider whether the improvement is primarily due to having access to the PUT (which DPP does not) vs. the other design choices, nor whether the results might be explained by the specific characteristics of the evaluation datasets."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper measures TP, FP, precision, recall, and F1 for bug detection, which directly maps to the claimed capability. No proxy gap exists — 'detecting bugs in plausible programs' is exactly what is measured."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 5.5 specifies 'gpt-3.5-turbo-0125' as the exact model version, including the snapshot date suffix. deepseek-v3 is also named in Section 7.2."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Figures 3 and 4 provide the actual prompt templates used for program variant generation and test input generator generation, including the instruction text and placeholders clearly labeled."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "No LLM API hyperparameters (temperature, top-p, max tokens, sampling settings) are reported anywhere in the paper despite multiple LLM calls being central to the approach."
    164       },
    165       "scaffolding_described": {
    166         "applies": false,
    167         "answer": false,
    168         "justification": "TrickCatcher is a sequential pipeline (generate variants → generate inputs → differential testing), not an agentic scaffolding system. There is no retry logic, feedback loops, memory, or tool use."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 5.2 describes how programs were selected: TrickyBugs provides 251 C++ and 115 Python plausible programs; EvalPlus programs were filtered from pre-generated LLM code samples to obtain those passing base tests but failing extra tests, yielding 151 tasks."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing three specific limitations: model budget constraints, LLM uncertainty, and data leakage risk."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The Limitations section identifies specific threats: (1) only two models used due to budget, suggesting more advanced LLMs could improve results; (2) LLM behavior uncertainty mitigated by multiple repetitions and averaging; (3) data leakage addressed by noting TrickyBugs was released after the model and EvalPlus prohibits training use."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The paper does not explicitly state what the results do NOT show or what settings are excluded. The Limitations mention budget constraints and suggest future work but do not clearly bound what claims should not be extrapolated from these results."
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Code and data are available at https://github.com/RinCloud/TrickCatcher. Both source datasets (TrickyBugs, EvalPlus) are publicly available under MIT and Apache 2.0 licenses."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Section 5.2 describes both datasets: TrickyBugs contains programs from an online judge platform, and EvalPlus programs were filtered from pre-generated LLM code samples with specific inclusion criteria (pass base tests, fail extra tests)."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": false,
    206         "answer": false,
    207         "justification": "No human participants. The data comes from standard benchmarks (TrickyBugs and EvalPlus)."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The pipeline is documented: for EvalPlus, programs are filtered from pre-generated samples to find those passing base but failing extra tests (resulting in 151 tasks). For TrickyBugs, 251 C++ and 115 Python plausible programs are selected. Appendix B documents the repetition and averaging pipeline."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "The Acknowledgements section lists specific grants: National Key R&D Program (2023YFB4503801), NSFC grants (62192733, 62192730), Hubei Province program (2023BAA024), and ITEA projects for Jie M. Zhang."
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "All author affiliations are listed: Peking University, Nanyang Technological University, King's College London, University College London, and National Key Laboratory of Data Space Technology and System."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Funding comes from government research programs (NSFC, National Key R&D, Hubei Province, InnovateUK) with no commercial interest in the evaluation outcome."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests or financial interests statement is present in the paper."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "The paper does not state the training data cutoff date for gpt-3.5-turbo-0125 or deepseek-v3. The Limitations section mentions TrickyBugs was released after the model but does not state the actual cutoff."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "The Limitations section discusses this: 'the TrickyBugs dataset we used was released after gpt-3.5-turbo-0125, and EvalPlus explicitly prohibits its use for training LLMs. Moreover, the poor performance of the three LLM-based baselines further suggests that data leakage is not a main concern.'"
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": true,
    250         "answer": true,
    251         "justification": "The Limitations section addresses contamination: TrickyBugs released after the model, EvalPlus prohibits training use, and poor baseline performance is cited as evidence against leakage being a factor."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study. All evaluation is automated on benchmark datasets."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants. The study evaluates automated test case generation on code benchmarks."
    264       },
    265       "demographics_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants."
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants."
    279       },
    280       "blinding_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants."
    284       },
    285       "attrition_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No inference costs, API costs, or latency figures are reported despite the approach requiring hundreds of LLM calls per PUT (generating 10 variants + input generators + 100 test inputs)."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No total computational budget is stated. The paper mentions 'budget constraints' as motivation for using gpt-3.5-turbo but does not quantify the actual spend or compute used."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "While Appendix B describes a combinatorial repetition scheme, no variance or sensitivity across runs is reported. All tables show only averaged point estimates."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": true,
    312         "justification": "Appendix B explicitly states: 100 test inputs sampled, 10 program variants sampled, with C(10, k) combinatorial rounds. For CHAT: 100 test cases sampled. For APR: 10 patches sampled."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No mention of any hyperparameter search. The number of program variants k is varied as a parameter study, but no search budget for other hyperparameters (e.g., prompt design, number of inputs) is reported."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": true,
    322         "justification": "Results are reported for all k values (2, 4, 6, 8, 10) in Table 1, with multiple comparison methods (Average, Best vs. Best, Worst vs. Worst). No cherry-picking of a single best configuration."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": false,
    326         "answer": false,
    327         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The authors implement their own modified version of the Differential Prompting baseline (Differential Prompting Plus, DPP, adapted for fair comparison) and do not acknowledge the bias of evaluating their own system against their own re-implementation of the baseline."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "TrickCatcher involves additional compute over DPP (PUT-guided generation, input generator creation + execution) but no comparison of compute budgets between methods is provided."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "The paper does not discuss whether TrickyBugs and EvalPlus actually represent real-world bug detection scenarios, or whether plausible programs from competitive programming are representative of bugs encountered in production software."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": false,
    346         "answer": false,
    347         "justification": "All methods use the same underlying LLM (gpt-3.5-turbo-0125) with different pipelines. The pipeline IS the thing being tested, so no scaffold confound exists."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": true,
    354         "justification": "The Limitations section notes that 'the TrickyBugs dataset we used was released after gpt-3.5-turbo-0125,' directly addressing temporal ordering between model training and benchmark availability."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether the evaluation setup leaks information. For instance, providing the PUT alongside the specification to the LLM could be considered a form of information leakage compared to real-world scenarios."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No discussion of whether programs in TrickyBugs or EvalPlus share structural similarities, come from overlapping sources, or whether the competitive programming tasks are independent of each other."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "No concrete leakage detection method is applied. The contamination discussion is entirely conceptual (dataset release dates, license terms, baseline performance argument)."
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "TrickCatcher achieves recall, precision, and F1 scores that are 1.80×, 2.65×, and 1.66× those of the state-of-the-art baseline (DPP).",
    376       "evidence": "Table 1 shows best-vs-best comparisons across TrickyBugs (C++, Python) and EvalPlus datasets. F1 scores: 41.31%, 42.35%, 51.34% vs DPP's 24.95%, 36.20%, 35.76%.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "TrickCatcher generates up to 16× fewer false positives for correct programs compared to baselines.",
    381       "evidence": "Figure 5 (RQ2) shows FP counts on EvalPlus canonical programs: TrickCatcher (k=2) generates 2.57 FPs vs DPP (k=10) at 26.33+1.67 FPs. The 16× figure appears to compare specific configurations.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Each component of TrickCatcher (PUT-guided generation, generator-based input generation, diversity-driven differential testing) contributes to performance.",
    386       "evidence": "Table 2 ablation study (RQ3) with 6 patterns on TrickyBugs (C++) shows systematic improvement when adding each component, comparing patterns 2 vs 3 (differential testing), 4 vs 6 (input generation), and 3/5 vs 6 (program generation).",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "TrickCatcher's performance remains consistently stable with different numbers of program variants.",
    391       "evidence": "Figure 6 (RQ4) shows TrickCatcher's precision and F1 remain stable across k=2 to k=10, while DPP fluctuates significantly.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "TrickCatcher demonstrates more significant improvement over DPP on more difficult coding tasks.",
    396       "evidence": "Figures 7 and 8 (RQ5) show difficulty distribution comparisons and program variant passing rates grouped by difficulty. TrickCatcher's advantage is more pronounced on high-difficulty tasks.",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "Generator-based input generation produces valid inputs, eliminating false positives due to invalid inputs.",
    401       "evidence": "Figure 5 shows TrickCatcher produces zero FPs from invalid inputs across all k values, while 40.10% of directly generated inputs are reported as invalid in Section 1.",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "TrickCatcher generalizes to different language models (deepseek-v3).",
    406       "evidence": "Table 3 shows deepseek-v3 achieves 59.54% F1 (k=5) on EvalPlus, outperforming gpt-3.5-turbo's 51.34% (k=10), but this is tested on only one dataset with one additional model.",
    407       "supported": "moderate"
    408     }
    409   ],
    410   "red_flags": [
    411     {
    412       "flag": "No statistical significance tests",
    413       "detail": "The paper repeatedly claims TrickCatcher 'significantly outperforms' baselines but provides no statistical significance tests. All comparisons are based on point estimates without p-values, confidence intervals, or any formal test of whether observed differences could be due to chance."
    414     },
    415     {
    416       "flag": "No variance or uncertainty reported despite multiple runs",
    417       "detail": "Appendix B describes an elaborate combinatorial repetition scheme (C(10, k) rounds × 100 inputs), but no standard deviation, interquartile range, or any spread measure is reported. Readers cannot assess the stability of the results."
    418     },
    419     {
    420       "flag": "Modified baseline may disadvantage comparison",
    421       "detail": "The primary baseline, Differential Prompting Plus (DPP), is the authors' own modification of Differential Prompting, adapted to handle plausible programs. The original method was not designed for this setting, and the modification may not represent the strongest possible adaptation."
    422     },
    423     {
    424       "flag": "No cost analysis despite heavy LLM usage",
    425       "detail": "The approach calls the LLM to generate 10 variants, create input generators, and produce 100 test inputs per program. With 517 programs across two datasets, this represents thousands of API calls, yet no cost or latency analysis is provided. The paper mentions 'budget constraints' as motivation for gpt-3.5-turbo but never quantifies costs."
    426     },
    427     {
    428       "flag": "Missing hyperparameters",
    429       "detail": "No LLM API hyperparameters (temperature, top-p, max tokens) are reported despite being critical to LLM output quality and reproducibility."
    430     }
    431   ],
    432   "cited_papers": [
    433     {
    434       "title": "Nuances are the key: Unlocking ChatGPT to find failure-inducing tests with differential prompting",
    435       "authors": ["Tsz-On Li", "Wenxi Zong", "Yibo Wang", "Haoye Tian", "Ying Wang", "Shing-Chi Cheung", "Jeff Kramer"],
    436       "year": 2023,
    437       "relevance": "State-of-the-art LLM-based test case generation baseline using differential testing, directly compared with TrickCatcher."
    438     },
    439     {
    440       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    441       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    442       "year": 2023,
    443       "relevance": "Introduces EvalPlus benchmark used for evaluation, relevant to AI code generation quality assessment."
    444     },
    445     {
    446       "title": "Evaluating large language models trained on code",
    447       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    448       "year": 2021,
    449       "arxiv_id": "2107.03374",
    450       "relevance": "Foundational work on evaluating LLMs for code generation (Codex/HumanEval), foundational to the benchmark evaluation approach."
    451     },
    452     {
    453       "title": "TrickyBugs: A dataset of corner-case bugs in plausible programs",
    454       "authors": ["Kaibo Liu", "Yudong Han", "Yiyang Liu", "Jie M. Zhang", "Zhenpeng Chen", "Federica Sarro", "Gang Huang", "Yun Ma"],
    455       "year": 2024,
    456       "relevance": "Primary evaluation dataset containing human-written plausible programs with tricky bugs from online judge platforms."
    457     },
    458     {
    459       "title": "Large language model-based agents for software engineering: A survey",
    460       "authors": ["Junwei Liu", "Kaixin Wang", "Yixuan Chen", "Xin Peng", "Zhenpeng Chen", "Lingming Zhang", "Yiling Lou"],
    461       "year": 2024,
    462       "arxiv_id": "2409.02977",
    463       "relevance": "Comprehensive survey of LLM-based agents in software engineering, providing context for the broader field."
    464     },
    465     {
    466       "title": "Evaluating and improving ChatGPT for unit test generation",
    467       "authors": ["Zhiqiang Yuan", "Mingwei Liu", "Shiji Ding", "Kaixin Wang", "Yixuan Chen", "Xin Peng", "Yiling Lou"],
    468       "year": 2024,
    469       "relevance": "LLM-based test generation approach (ChatTester) compared as related work on using LLMs for software testing."
    470     },
    471     {
    472       "title": "An empirical evaluation of using large language models for automated unit test generation",
    473       "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"],
    474       "year": 2024,
    475       "relevance": "Empirical evaluation of LLM-based test generation (TestPilot), directly relevant to the landscape of AI-powered testing."
    476     },
    477     {
    478       "title": "Code-aware prompting: A study of coverage-guided test generation in regression setting using LLM",
    479       "authors": ["Gabriel Ryan", "Siddhartha Jain", "Mingyue Shang", "Shiqi Wang", "Xiaofei Ma", "Murali Krishna Ramanathan", "Baishakhi Ray"],
    480       "year": 2024,
    481       "relevance": "Coverage-guided LLM test generation approach (SymPrompt), related work on using LLMs for test case generation."
    482     },
    483     {
    484       "title": "The counterfeit conundrum: Can code language models grasp the nuances of their incorrect generations?",
    485       "authors": ["Alex Gu", "Wen-Ding Li", "Naman Jain", "Theo Olausson", "Celine Lee", "Koushik Sen", "Armando Solar-Lezama"],
    486       "year": 2024,
    487       "relevance": "Studies whether LLMs can detect bugs in their own generated code, directly relevant to detecting bugs in plausible programs."
    488     },
    489     {
    490       "title": "B4: Towards optimal assessment of plausible code solutions with plausible tests",
    491       "authors": ["Mouxiang Chen", "Zhongxin Liu", "He Tao", "Yusu Hong", "David Lo", "Xin Xia", "Jianling Sun"],
    492       "year": 2024,
    493       "relevance": "Addresses assessment of plausible code solutions using plausible tests, closely related to bug detection in plausible programs."
    494     },
    495     {
    496       "title": "Who judges the judge: An empirical study on online judge tests",
    497       "authors": ["Kaibo Liu", "Yudong Han", "Jie M. Zhang", "Zhenpeng Chen", "Federica Sarro", "Mark Harman", "Gang Huang", "Yun Ma"],
    498       "year": 2023,
    499       "relevance": "Identified 3,440 tricky bugs in human-written programs on online judge platforms, motivating the TrickCatcher approach."
    500     }
    501   ],
    502   "engagement_factors": {
    503     "practical_relevance": {
    504       "score": 2,
    505       "justification": "Practitioners working on test generation or CI/CD pipelines could use TrickCatcher to detect bugs in plausible programs, and code is publicly available."
    506     },
    507     "surprise_contrarian": {
    508       "score": 1,
    509       "justification": "The diversity-driven oracle selection (trusting minority outputs over majority voting) is counterintuitive but not a major paradigm challenge."
    510     },
    511     "fear_safety": {
    512       "score": 0,
    513       "justification": "No AI safety or security concerns raised; the work is about improving software testing."
    514     },
    515     "drama_conflict": {
    516       "score": 0,
    517       "justification": "No controversy or conflict in the findings."
    518     },
    519     "demo_ability": {
    520       "score": 2,
    521       "justification": "Code is available on GitHub at https://github.com/RinCloud/TrickCatcher, though it requires API keys and benchmark data setup."
    522     },
    523     "brand_recognition": {
    524       "score": 1,
    525       "justification": "Mark Harman (UCL) is well-known in software engineering research, and Peking University has strong brand recognition in CS, but not mainstream fame."
    526     }
    527   }
    528 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs