scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24572B)
      1 {
      2   "paper": {
      3     "title": "Codexity: Secure AI-assisted Code Generation",
      4     "authors": ["Sung Yong Kim", "Zhiyu Fan", "Yannic Noller", "Abhik Roychoudhury"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2405.03927",
      8     "doi": "10.48550/arXiv.2405.03927"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "The paper provides a GitHub link (https://github.com/Codexity-APR/Codexity) and a Zenodo archive (https://doi.org/10.5281/zenodo.10572275) in the abstract section."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The Zenodo archive and GitHub repository are listed as 'Tool and Data' in the abstract, indicating the benchmark dataset of 751 vulnerable subjects is included."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper mentions hardware (MacBook Pro 2019, Ubuntu 18.04 Server with RTX 4090) and model names but does not provide requirements.txt, Dockerfile, or detailed dependency/library version specifications."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions are provided in the paper. The demo walkthrough (Appendix A) describes how to use the VSCode extension but not how to reproduce the experimental results."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Results in Table 2 report only point estimates (e.g., 15.9%, 39.3%) with no confidence intervals, error bars, or uncertainty quantification."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper claims Codexity prevents 60% of vulnerabilities compared to ChatGPT, but no statistical significance tests are performed to support comparative claims."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper reports absolute reductions in the vulnerable-program rate with baseline context: 'reduction of 60% vulnerable programs' (from 75.9% to 15.9%, i.e. 60 percentage points, for iteration repair), and reductions of 36.6 and 29.5 percentage points for the preshot variants. These provide enough context for effect size assessment."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The benchmark consists of 90 prompts and 990 completions. No justification is given for why this sample size is adequate, nor is any power analysis discussed."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No variance or standard deviation is reported. Table 2 shows single-run results. For preshot repair with temperature 0.2, no spread across multiple runs is reported."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper compares against ChatGPT (baseline without repair), FootPatch (a static-analysis-based APR tool), and GitHub Copilot in RQ2."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "GitHub Copilot and ChatGPT (gpt-3.5-turbo) are contemporary tools. FootPatch (2018) is older but is chosen for a specific reason: it also uses Infer for vulnerability detection, making it a relevant comparison."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "RQ3 examines the tradeoffs between the two repair strategies (iteration repair vs. preshot repair), which serves as an ablation of the framework's components. The preshot repair variants with different local models (StarCoder vs. SantaCoder) also function as ablations."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper reports multiple metrics: number/percentage of vulnerable codes generated, average generation time (seconds), and lines of code (LoC) in Table 2."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No human evaluation of the generated code is performed. All vulnerability detection relies on automated static analysis tools (Infer, CppCheck). Human evaluation of code quality or correctness beyond vulnerability would be relevant but is absent."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The same 90 prompts used to construct the benchmark are used for all evaluations. There is no separation into development and test sets, and no held-out evaluation set."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Table 1 provides a detailed breakdown of vulnerability categories (24 types including Null Dereference, Resource Leak, Buffer Overrun, etc.) with counts for each."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "In RQ2, the paper discusses FootPatch's failure to handle null dereferences due to incorrect variable name extraction. In RQ3, the paper acknowledges that preshot repair sometimes outputs comments instead of code (6.1% for StarCoder, 8.5% for SantaCoder), which are counted as vulnerable."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports that preshot repair has a higher vulnerable code generation rate than iteration repair, and acknowledges that iteration repair introduces new vulnerabilities in some iterations, leading to additional repair rounds. The paper also notes preshot repair sometimes fails to produce code at all."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims 'Codexity can prevent 60% of the vulnerabilities being exposed to the software developer,' which is supported by Table 2 showing a reduction from 75.9% to 15.9% vulnerable code (iteration repair). The claim of 751 vulnerable subjects derived from 90 prompts is also supported by Section 4.1."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper claims Codexity's repair strategies reduce vulnerabilities. This is demonstrated through controlled comparisons: same prompts processed with and without the repair framework, which constitutes adequate single-variable manipulation for this type of causal claim."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The title 'Secure AI-assisted Code Generation' implies broad applicability, but the evaluation is limited to C programs, a specific set of 90 prompts, and gpt-3.5-turbo. The paper mentions future work for Java/Python in the conclusion but does not bound its claims to C. The abstract also makes general claims about 'AI programming assistants' while only testing ChatGPT and two local models."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "No alternative explanations are discussed. For example, the paper does not consider whether the improvements come from simply prompting ChatGPT a second time (regardless of vulnerability feedback), or whether the static analyzers might miss certain vulnerability types, creating a false sense of security."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper states 'the latest ChatGPT (gpt-3.5-turbo)' without specifying a snapshot date or exact version (e.g., gpt-3.5-turbo-0613). StarCoder and SantaCoder are identified by name and parameter count (15.5B, 1.1B) but without specific version hashes or dates."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Figure 1 shows the conversational prompt template used for vulnerability repair, including the system message structure with placeholders for vulnerable code and SAST feedback. The actual prompt structure is visible and reproducible."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Temperature settings are reported: 0 for iteration repair, 0.2 for StarCoder (following its authors' setup), 0 and 0.8 for benchmark construction. Maximum prediction tokens of 1024 and maximum iterations of 3 are also specified."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The paper describes the Codexity workflow in detail in Section 3, including the iteration repair loop (Section 3.2) and preshot repair strategy (Section 3.3), with Figure 2 showing the workflow diagrams for both strategies."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 4.1 and Figure 3 describe the benchmark construction pipeline in detail: 403 posts collected from ShareGPT/StackOverflow using keywords, filtered to 124 vulnerable posts via static analysis, then further refined to 90 vulnerable prompts through a two-round detection strategy with ChatGPT at two temperature settings."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion briefly mentions future work directions (other languages, fine-tuning) but does not discuss limitations of the current work."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No specific threats to validity are discussed anywhere in the paper."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The paper does not explicitly state what the results do not show. The conclusion mentions future work for Java/Python, implying the current scope is C only, but there is no explicit statement bounding the claims to C programs or specific vulnerability types detectable by CppCheck/Infer."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The Zenodo archive (doi:10.5281/zenodo.10572275) and GitHub repository are provided, which should contain the benchmark data and tool for independent verification."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 4.1 describes data collection in detail: 403 posts from ShareGPT and the first 700 StackOverflow posts were collected using 'c' and 'int main' as keywords, then filtered through a two-round vulnerability detection strategy."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants were involved. The study uses publicly available code prompts from ShareGPT and StackOverflow, which are standard public data sources."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Figure 3 and Section 4.1 document the full pipeline: 403 posts → static analysis → 124 vulnerable posts → ChatGPT completions at two temperatures (1,364 completions) → static analysis → 90 vulnerable prompts with 751 vulnerable completions. Counts are provided at each stage."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The Acknowledgments section states: 'This work was partially supported by a Singapore Ministry of Education (MoE) Tier 3 grant \"Automated Program Repair\", MOE-MOET32021-0001.'"
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly listed: National University of Singapore and Singapore University of Technology and Design. No affiliations with evaluated products (ChatGPT/OpenAI, GitHub Copilot/Microsoft)."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The funder is the Singapore Ministry of Education, which is a government funding agency with no financial stake in the specific outcomes of the evaluation of commercial LLM tools."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement or financial disclosure is present in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper uses gpt-3.5-turbo, StarCoder, and SantaCoder but does not state the training data cutoff dates for any of these models. Since the benchmark uses StackOverflow posts that could be in the training data, this is relevant."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The benchmark is constructed from StackOverflow and ShareGPT posts, which are publicly available and likely in the training data of gpt-3.5-turbo. This potential overlap is not discussed."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The StackOverflow posts used to construct the benchmark were publicly available before the training cutoff of gpt-3.5-turbo. ShareGPT data similarly predates the model. This contamination risk is not addressed."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants were involved in this study. The evaluation is entirely automated using benchmarks and static analysis tools."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants were involved in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants were involved in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants were involved in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants were involved in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants were involved in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants were involved in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "The paper reports average generation time (e.g., 82.8s for iteration repair, 45.4s for StarCoder preshot) but does not report API costs, tokens consumed, or cost per example. RQ3 discusses cost tradeoffs qualitatively but provides no dollar amounts or token counts."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "Hardware is mentioned (MacBook Pro 2019, Ubuntu 18.04 with RTX 4090) but total computational budget (GPU hours, total API spend, total experiment time) is not stated."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "Codexity with iteration repair prevents 60% of vulnerabilities compared to ChatGPT baseline.",
    287       "evidence": "Table 2 shows ChatGPT generated 751/990 (75.9%) vulnerable codes, while IR-ChatGPT generated 157/990 (15.9%), a reduction of 60 percentage points (Section 4.2, RQ1).",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "Preshot repair with StarCoder reduces vulnerable code generation by 36.6% compared to ChatGPT.",
    292       "evidence": "Table 2 shows PR-StarCoder-15.5B generated 389/990 (39.3%) vulnerable codes vs. 751/990 (75.9%) for ChatGPT (Section 4.2, RQ1).",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Preshot repair with SantaCoder reduces vulnerable code generation by 29.5% compared to ChatGPT.",
    297       "evidence": "Table 2 shows PR-SantaCoder-1.1B generated 459/990 (46.4%) vulnerable codes vs. 751/990 (75.9%) for ChatGPT (Section 4.2, RQ1).",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "FootPatch detected 20 vulnerabilities but patched none on the benchmark.",
    302       "evidence": "Section 4.2, RQ2 describes FootPatch finding 19 null dereferences and one memory leak but failing to patch any due to incorrect variable identification.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "GitHub Copilot generates vulnerable code for 84.44% of the 90 prompts.",
    307       "evidence": "Section 4.2, RQ2 states Infer and CppCheck found 76 (84.44%) of Copilot's programs to be vulnerable.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Codexity is the first security-focused code generation framework.",
    312       "evidence": "Stated in Section 1 introduction. The related work section (Section 5) surveys prior approaches but none integrate SAST tools directly into the code generation loop for IDE use.",
    313       "supported": "weak"
    314     }
    315   ],
    316   "methodology_tags": ["benchmark-eval", "case-study"],
    317   "key_findings": "Codexity integrates LLMs with static analysis tools (Infer, CppCheck) to reduce security vulnerabilities in AI-generated code. The iteration repair strategy reduces vulnerable code generation from 75.9% to 15.9% (a 60 percentage-point reduction) on a benchmark of 990 code completion attempts derived from 90 vulnerable prompts. Preshot repair, which uses a cheaper local model to anticipate vulnerabilities before querying the commercial LLM, achieves reductions of 36.6 (StarCoder) and 29.5 (SantaCoder) percentage points with faster generation times. The approach is limited to C code and relies on static analyzers' detection capabilities.",
    318   "red_flags": [
    319     {
    320       "flag": "No statistical rigor",
    321       "detail": "All results are single-run point estimates with no confidence intervals, significance tests, or variance reporting. The 60% claim rests entirely on comparing two raw percentages."
    322     },
    323     {
    324       "flag": "No limitations section",
    325       "detail": "The paper lacks any discussion of limitations or threats to validity, which is a significant omission for a tool paper making effectiveness claims."
    326     },
    327     {
    328       "flag": "Contamination risk unaddressed",
    329       "detail": "The benchmark is constructed from StackOverflow and ShareGPT posts that are likely in gpt-3.5-turbo's training data. The model may already know these specific code patterns, affecting the validity of vulnerability generation rates."
    330     },
    331     {
    332       "flag": "Circular vulnerability definition",
    333       "detail": "Vulnerabilities are defined as whatever CppCheck and Infer detect. The paper evaluates success by whether these same tools report no vulnerability after repair. This means the evaluation only measures whether the LLM can produce code that passes the same static analyzers, not whether the code is actually secure."
    334     },
    335     {
    336       "flag": "Overbroad generalization",
    337       "detail": "The title and abstract imply general 'secure AI-assisted code generation' but the evaluation is limited to C programs, a specific set of CWE types detectable by two static analyzers, and a single commercial LLM (gpt-3.5-turbo)."
    338     },
    339     {
    340       "flag": "No correctness evaluation",
    341       "detail": "The paper evaluates only whether vulnerabilities are removed but does not check whether the repaired code is functionally correct. The repair process could introduce functional bugs while removing vulnerabilities."
    342     }
    343   ],
    344   "cited_papers": [
    345     {
    346       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    347       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"],
    348       "year": 2022,
    349       "doi": "10.1109/SP46214.2022.9833571",
    350       "relevance": "Foundational work assessing security vulnerabilities in Copilot-generated code, directly motivating Codexity."
    351     },
    352     {
    353       "title": "How Secure is Code Generated by ChatGPT?",
    354       "authors": ["Raphaël Khoury", "Anderson R Avila", "Jacob Brunelle", "Baba Mamadou Camara"],
    355       "year": 2023,
    356       "arxiv_id": "2304.09655",
    357       "relevance": "Evaluates security of ChatGPT-generated code, providing evidence for the vulnerability problem Codexity addresses."
    358     },
    359     {
    360       "title": "Security Weaknesses of Copilot Generated Code in GitHub",
    361       "authors": ["Yujia Fu", "Peng Liang", "Amjed Tahir", "Zengyang Li", "Mojtaba Shahin", "Jiaxin Yu"],
    362       "year": 2023,
    363       "arxiv_id": "2310.02059",
    364       "relevance": "Studies security weaknesses in Copilot-generated code found in real GitHub repositories."
    365     },
    366     {
    367       "title": "Examining Zero-Shot Vulnerability Repair with Large Language Models",
    368       "authors": ["Hammond Pearce", "Benjamin Tan", "Baleegh Ahmad", "Ramesh Karri", "Brendan Dolan-Gavitt"],
    369       "year": 2023,
    370       "doi": "10.1109/SP46215.2023.10179324",
    371       "relevance": "Evaluates LLMs for vulnerability repair using CodeQL, directly related to using LLMs for security fixing."
    372     },
    373     {
    374       "title": "A New Era in Software Security: Towards Self-Healing Software via Large Language Models and Formal Verification",
    375       "authors": ["Yiannis Charalambous", "Norbert Tihanyi", "Ridhi Jain", "Youcheng Sun", "Mohamed Amine Ferrag", "Lucas C Cordeiro"],
    376       "year": 2023,
    377       "arxiv_id": "2305.14752",
    378       "relevance": "Explores LLMs combined with formal verification for automated vulnerability repair."
    379     },
    380     {
    381       "title": "Large Language Models for Code: Security Hardening and Adversarial Testing",
    382       "authors": ["Jingxuan He", "Martin Vechev"],
    383       "year": 2023,
    384       "relevance": "Proposes learning strategies for improved secure code generation with LLMs."
    385     },
    386     {
    387       "title": "Demystifying GPT Self-Repair for Code Generation",
    388       "authors": ["Theo X Olausson", "Jeevana Priya Inala", "Chenglong Wang", "Jianfeng Gao", "Armando Solar-Lezama"],
    389       "year": 2023,
    390       "arxiv_id": "2306.09896",
    391       "relevance": "Studies LLM self-repair for code generation, relevant to iterative repair approaches."
    392     },
    393     {
    394       "title": "StarCoder: may the source be with you!",
    395       "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"],
    396       "year": 2023,
    397       "arxiv_id": "2305.06161",
    398       "relevance": "One of the code generation models used in Codexity's preshot repair strategy."
    399     },
    400     {
    401       "title": "SantaCoder: don't reach for the stars!",
    402       "authors": ["Loubna Ben Allal", "Raymond Li", "Denis Kocetkov"],
    403       "year": 2023,
    404       "arxiv_id": "2301.03988",
    405       "relevance": "Another code generation model used in Codexity's preshot repair strategy."
    406     },
    407     {
    408       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    409       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    410       "year": 2023,
    411       "arxiv_id": "2305.05176",
    412       "relevance": "Discusses using multiple LLMs to balance cost and quality, relevant to Codexity's multi-model approach."
    413     },
    414     {
    415       "title": "Trust Enhancement Issues in Program Repair",
    416       "authors": ["Yannic Noller", "Ridwan Shariffdeen", "Xiang Gao", "Abhik Roychoudhury"],
    417       "year": 2022,
    418       "doi": "10.1145/3510003.3510040",
    419       "relevance": "Discusses trust issues in automated program repair, relevant to reliability of AI-assisted code fixes."
    420     }
    421   ]
    422 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs