ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27884B)


      1 {
      2   "paper": {
      3     "title": "AI Code in the Wild: Measuring Security Risks and Ecosystem Shifts of AI-Generated Code in Modern Software",
      4     "authors": [
      5       "Bin Wang",
      6       "Wenjie Yu",
      7       "Yilu Zhong",
      8       "Hao Yu",
      9       "Keke Lian",
     10       "Chaohua Lu",
     11       "Hongfang Zheng",
     12       "Dong Zhang",
     13       "Hui Li"
     14     ],
     15     "year": 2025,
     16     "venue": "arXiv",
     17     "arxiv_id": "2512.18567"
     18   },
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The abstract states 'We will open-source the complete dataset and release analysis artifacts' and Section 3 says 'We will open-source the corresponding detection model and dataset.' Both are promises of future release, not actual releases. No repository URL or archive link is provided."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The dataset and analysis artifacts are promised for future release (the abstract states 'We will open-source the complete dataset and release analysis artifacts'), but no download link or repository URL is provided in the paper."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No environment specifications, requirements files, Dockerfiles, or library version listings are provided anywhere in the paper."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided. The methodology is described at a high level in Sections 3-4 but without specific commands, scripts, or a reproduction guide."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Results are reported as point estimates throughout (e.g., accuracy 0.718, precision 0.716). No confidence intervals, error bars, or uncertainty quantification is provided for any of the main results."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section 6.3 uses the Mann-Whitney U test to compare severity score distributions between AI-introduced and human-introduced vulnerabilities (p=0.091, alpha=0.05). The null and alternative hypotheses are explicitly stated."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "While the paper reports percentage differences (e.g., AI net impact scores like +7.6% for C#, +4.9% for PHP), no formal effect size measures (Cohen's d, odds ratios, etc.) are reported. The Mann-Whitney U test in Section 6.3 reports only p-values, not effect sizes."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for why the top 1,000 GitHub repositories were chosen as the sample size, nor is there any power analysis. The choice of 7,000+ CVEs is also not justified beyond availability."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Detection framework results (accuracy, precision, recall, F1) are reported as single-run point estimates. No standard deviations, confidence intervals across runs, or variance measures are provided for the CAF evaluation or the ecosystem analysis."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 3.2 evaluates seven baseline AI-generated text detection models, and Section 3.3.1 compares CAF against these baselines (Fig. 3b)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The baseline models include recent detection tools from ICLR 2024 and contemporary Hugging Face models, representing current state-of-the-art in AIGC detection."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 3.3.1 presents an ablation study comparing the full CAF model with variants: 'w/o Stage 1' (removing cascaded preprocessing) and 'w/o Stage 2' (removing weighted aggregation), shown in Fig. 3c."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The detection framework is evaluated using four metrics: Accuracy, Precision, Recall, and F1-score (Section 3.2). The security analysis uses CVSS scores, attack vector distributions, CWE type distributions, and AI net impact scores."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation of the detection pipeline's outputs is reported. The accuracy of the AIGCode detector — which is the foundation of all downstream ecosystem and security claims — is evaluated only against automated labels, with no human verification of a sample of the detector's classifications on real-world commits."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 3.2 states '30% of the samples from the evaluation dataset' were used for testing baselines, and Section 3.3.1 evaluates CAF 'on the held-out 70% of the evaluation set.' The split is explicit."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Extensive per-category breakdowns are provided: by programming language (Fig. 4a), by tech stack over time (Fig. 4b), by file function category (Fig. 4c), by CWE type (Fig. 5b), and by language for vulnerability lifecycle roles (Fig. 5a)."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 9 (Limitations) discusses detection accuracy limitations, acknowledging the detector 'leaves room for improvement — for example, in more finely identifying AI-originated code that has been heavily edited by humans.' The ablation study also discusses failure modes (e.g., removing Stage 2 causes 'catastrophic failure')."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 3.2 reports that existing AIGC text detection models 'face significant performance challenges' when transferred to code detection. Section 3.3.1 reports ablation variants that fail dramatically (w/o Stage 2 has F1=0.012). Section 6 reports that AI's net impact on security is negative across all languages."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract's three 'ecological patterns' — (1) AI concentrates in boilerplate/tests/docs, (2) some CWE families overrepresented in AI code, (3) AI introduces high-throughput changes while humans act as gatekeepers — are all supported by specific results in Sections 5-7 with figures and statistics."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes causal claims such as 'AI-induced vulnerabilities propagated by shared models rather than shared maintainers' (abstract) and 'when review is shallow, AI-introduced defects persist longer.' These causal interpretations are drawn from observational/correlational data (classifier-based labeling of commits) without adequate causal identification strategies. The study cannot distinguish whether AI caused the vulnerabilities or whether the AI detector is correlated with certain code patterns that are independently vulnerable."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title says 'Modern Software' but the study only covers the top 1,000 GitHub repositories by stars, which is a highly non-representative sample (skewed toward popular open-source projects). Findings are presented as general statements about 'AIGCode in the wild' without adequately bounding the generalization to this specific sample. The abstract calls it 'the first large-scale empirical study of AI-generated code in the wild' but the 'wild' is limited to top-starred GitHub repos."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper does not substantively discuss alternative explanations for its findings. For example, the correlation between AI-detected code and vulnerability patterns could be driven by detector bias (the detector may systematically misclassify certain code styles). The Section 9 limitations mention detector accuracy but do not discuss how detection errors might systematically bias the security findings."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The AI-generated code subset was created using 11 LLMs listed in Fig. 1a (e.g., 'gpt-4o', 'claude-sonnet-4-20250514', 'gpt-3.5-turbo', 'Qwen2.5-Coder-Instruct', 'gemini-2.5-flash'). While 'claude-sonnet-4-20250514' includes a date, most others like 'gpt-4o', 'gpt-3.5-turbo', and 'grok-3' lack version specifiers or snapshot dates. The baseline detection models are referred to only by pseudonyms (HCR, SRL, ORD, DAD, Y-Zh2, Y-En2, Y-En3) with no real model names or versions."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Section 3.1 describes 33 programming topics refined into 165 specific tasks used to generate AI code, but the actual prompts used for code generation are not provided. The prompts or instructions given to the LLM-based judgment mechanism for CWE scanning are also not provided."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Key hyperparameters for the detection framework are reported: Stage 1 confidence threshold tau_1 = 0.9, Stage 2 decision threshold tau_2 = 0.53, and the master referee weight of 2 vs auxiliary weight of 1 (Section 3.3)."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "The paper does not use agentic scaffolding. The detection framework is an ensemble of classifiers, not an agentic system."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3.1 documents the data construction pipeline: time window for human code (2008-2010), filtering non-code files by extension, serialization. Section 4 describes the GitHub data collection (top 1000 repos by stars, 2022-2025 window, file-level granularity). The vulnerability dataset pipeline is described in Section 4 (CVE metadata retrieval, GitHub search, manual verification)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 9 is titled 'Limitations and Future Work' and contains substantive discussion of the study's limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 9 discusses specific threats: (1) the 2022-2025 time window may miss earlier AI tools, (2) the detector accuracy is not perfect and 'leaves room for improvement — for example, in more finely identifying AI-originated code that has been heavily edited by humans,' and (3) they 'deliberately do not perform extensive modeling, tuning, or error analysis of the detector itself.'"
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "While Section 9 mentions the time window limitation and detector accuracy, the paper does not explicitly state what the results do NOT show. It does not bound the findings to top-starred GitHub repos or discuss that findings may not apply to private/enterprise codebases, smaller projects, or non-English-dominant projects."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The raw data (commit-level classifications, CVE-linked code samples, detection pipeline outputs) is not available. The paper promises future release but provides no current access."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Sections 3.1 and 4 describe the data collection in detail: the human code subset from 2008-2010 commits on high-starred repos, AI code generated from 11 LLMs across 165 tasks, GitHub top 1000 repos by stars for 2022-2025, and CVE-linked code from public intelligence sources with manual verification."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants are involved. The data sources are public repositories and CVE databases, which are standard public data sources."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The full pipeline is documented: Section 3.1 describes dataset construction from collection to labeling; Section 3.3 describes the detection framework stages; Section 4 describes the in-the-wild data collection from GitHub commits and CVE sources. Sample counts are provided (40K human, 40K AI for evaluation set; top 1000 repos; 7000+ CVEs)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section is present in the paper. No grants or sponsors are mentioned."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: Peking University and Tencent. The paper header lists each author's affiliation."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Several authors are affiliated with Tencent, a major technology company that develops and uses AI code generation tools. The paper does not disclose whether Tencent funded this research, nor does it discuss whether Tencent has a financial interest in the findings about AI-generated code security."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper. Given that Tencent-affiliated authors are involved, the absence of any conflict disclosure is a gap."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. The detection framework is an ensemble of existing classifiers applied to code provenance detection, not a benchmark evaluation of LLM knowledge."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. The detection task is classification of human vs. AI code, not model knowledge evaluation."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable: the paper does not evaluate a pre-trained model's capability on a benchmark. The paper's contribution is an empirical study of AI code prevalence and security, not a benchmark evaluation of LLM capabilities."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved. The study analyzes code repositories and CVE data."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants are involved."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The paper does not report the cost of running the detection pipeline across the top 1,000 GitHub repositories, nor the cost of generating the AI code evaluation dataset using 11 commercial and open-source LLMs."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No computational budget is stated — no mention of GPU hours, API costs, total processing time, or hardware used for the detection pipeline or data generation."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "AIGCode is already a substantial fraction of new code in top GitHub repositories, with AI file rates ranging from 25.42% (Java) to 45.43% (TypeScript).",
    296       "evidence": "Section 5.1 and Fig. 4a present per-language AI file rates computed from the top 1000 GitHub repos (2022-2025).",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "The Cascade-Aggregation Framework achieves best overall performance with Accuracy 0.718, Precision 0.716, and F1 0.719 for AI code detection.",
    301       "evidence": "Section 3.3.1 and Fig. 3b compare CAF against seven baselines on the held-out 70% evaluation set.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "AI's net impact on security is negative across all analyzed languages — AI introduces more vulnerabilities than it fixes.",
    306       "evidence": "Section 6.1 and Fig. 5a show positive AI net impact scores (introduction rate minus fix rate, so a positive score means AI introduces more vulnerabilities than it fixes) for all 10 languages analyzed.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "There is no statistically significant difference in severity between AI-introduced and human-introduced vulnerabilities (Mann-Whitney U test, p=0.091).",
    311       "evidence": "Section 6.3 reports the Mann-Whitney U test result with median AI severity 7.00 vs human 7.10.",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "AI-introduced vulnerabilities are more concentrated in network attack vectors (86.8%) compared to human-introduced vulnerabilities (80.8%).",
    316       "evidence": "Section 6.3 and Fig. 5d report attack vector distributions.",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "CWE-1236 and CWE-916 are the most overrepresented vulnerability types in AI-generated code, with AI file rates of 50.7% and 43.1% respectively.",
    321       "evidence": "Section 6.2 and Fig. 5b present the CWE type distribution analysis.",
    322       "supported": "moderate"
    323     },
    324     {
    325       "claim": "Existing AIGC text detectors suffer from precision-recall imbalance when transferred to code detection tasks.",
    326       "evidence": "Section 3.2 and Fig. 3a show that seven baseline models have highly imbalanced precision/recall on the evaluation dataset.",
    327       "supported": "strong"
    328     }
    329   ],
    330   "methodology_tags": [
    331     "observational",
    332     "benchmark-eval"
    333   ],
    334   "key_findings": "This paper presents the first large-scale empirical study of AI-generated code prevalence and security impact across the top 1,000 GitHub repositories (2022-2025) and 7,000+ CVE-linked code changes. Key findings include: AI-generated code concentrates in documentation, tests, and boilerplate rather than core logic and config files; certain CWE families (particularly input validation and cryptographic issues) are overrepresented in AI-tagged code; and AI introduces more vulnerabilities than it fixes across all analyzed languages, though the severity is comparable to human-introduced vulnerabilities. The paper also proposes a Cascade-Aggregation Framework for AI code detection that achieves F1=0.719.",
    335   "red_flags": [
    336     {
    337       "flag": "All findings depend on imperfect detector with only 71.8% accuracy",
    338       "detail": "The entire study's ecosystem and security findings rely on the CAF detector, which has only 71.8% accuracy and 71.6% precision. Roughly 28% of classifications may be wrong, and systematic biases in the detector could propagate into all downstream results (e.g., the security risk profile, language adoption rates). The paper acknowledges this in Section 9 but does not quantify how detection errors affect the security conclusions."
    339     },
    340     {
    341       "flag": "No human validation of detector on in-the-wild data",
    342       "detail": "The detector is evaluated on a synthetic evaluation dataset where AI code is generated from prompts (Section 3.1) and human code is from 2008-2010 commits. In-the-wild code from 2022-2025 may have very different characteristics (e.g., human code influenced by AI suggestions, hybrid human-AI editing). No human expert validates the detector's classifications on actual GitHub commits."
    343     },
    344     {
    345       "flag": "Causal claims from correlational data",
    346       "detail": "The paper states AI-generated code 'introduces' vulnerabilities and describes 'AI-induced vulnerabilities propagated by shared models.' These are causal claims drawn from an observational study where code is labeled by an imperfect classifier. The detector may be confounding code style with code provenance."
    347     },
    348     {
    349       "flag": "Potential industry conflict not disclosed",
    350       "detail": "Four authors are affiliated with Tencent, a major tech company that both develops and uses AI code generation tools. No funding source or competing interests statement is provided."
    351     },
    352     {
    353       "flag": "No variance or uncertainty reporting for any results",
    354       "detail": "All detection framework metrics and ecosystem statistics are reported as point estimates without error bars, confidence intervals, or variance across multiple runs. Given the 71.8% accuracy, uncertainty quantification is especially important."
    355     },
    356     {
    357       "flag": "Evaluation dataset has temporal confound",
    358       "detail": "The human code subset comes from 2008-2010 commits while AI code is generated from recent LLMs. The detector may be learning temporal coding style differences (modern vs. older idioms) rather than genuine human/AI differences. When applied to 2022-2025 code, this confound could cause systematic misclassification."
    359     }
    360   ],
    361   "cited_papers": [
    362     {
    363       "title": "The impact of ai on developer productivity: Evidence from github copilot",
    364       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    365       "year": 2023,
    366       "arxiv_id": "2302.06590",
    367       "relevance": "RCT measuring GitHub Copilot's effect on developer productivity, directly relevant to the survey's core question of AI coding tool impact."
    368     },
    369     {
    370       "title": "Do users write more insecure code with ai assistants?",
    371       "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar", "Dan Boneh"],
    372       "year": 2023,
    373       "relevance": "User study examining whether AI assistants lead to more insecure code, directly relevant to the security dimension of AI-generated code."
    374     },
    375     {
    376       "title": "Security weaknesses of copilot-generated code in github projects: An empirical study",
    377       "authors": ["Yujia Fu", "Peng Liang", "Amjed Tahir", "Zengyang Li", "Mojtaba Shahin", "Jiaxin Yu", "Jinfu Chen"],
    378       "year": 2025,
    379       "relevance": "Empirical study of security weaknesses in Copilot-generated code found in GitHub projects, directly related to AIGCode security."
    380     },
    381     {
    382       "title": "Evaluating large language models trained on code",
    383       "authors": ["Mark Chen"],
    384       "year": 2021,
    385       "arxiv_id": "2107.03374",
    386       "relevance": "Foundational paper introducing HumanEval and Codex, establishing code generation benchmarks relevant to the survey."
    387     },
    388     {
    389       "title": "Vulnerabilities in ai code generators: Exploring targeted data poisoning attacks",
    390       "authors": ["Domenico Cotroneo", "Cristina Improta", "Pietro Liguori", "Roberto Natella"],
    391       "year": 2024,
    392       "relevance": "Studies targeted data poisoning attacks against AI code generators, relevant to understanding security risks of AI-generated code."
    393     },
    394     {
    395       "title": "Security implications of large language model code assistants: A user study",
    396       "authors": ["Gustavo Sandoval", "Hammond Pearce", "Teo Nys", "Ramesh Karri", "Brendan Dolan-Gavitt", "Siddharth Garg"],
    397       "year": 2022,
    398       "arxiv_id": "2208.09727",
    399       "relevance": "User study on security implications of LLM code assistants, measuring whether AI assistance affects code security in practice."
    400     },
    401     {
    402       "title": "The impact of generative AI on collaborative open-source software development: Evidence from GitHub Copilot",
    403       "authors": ["Fangchen Song", "Ashish Agarwal", "Wen Wen"],
    404       "year": 2024,
    405       "arxiv_id": "2410.02091",
    406       "relevance": "Measures impact of generative AI on open-source software development, relevant to understanding AI code adoption."
    407     },
    408     {
    409       "title": "Codexity: secure AI-assisted code generation",
    410       "authors": ["Sung Yong Kim", "Zhiyu Fan", "Yannic Noller", "Abhik Roychoudhury"],
    411       "year": 2024,
    412       "arxiv_id": "2405.03927",
    413       "relevance": "Proposes methods for secure AI-assisted code generation, directly relevant to mitigating security risks of AI-generated code."
    414     },
    415     {
    416       "title": "SecurityEval dataset: mining vulnerability examples to evaluate machine learning-based code generation techniques",
    417       "authors": ["Mohammed Latif Siddiq", "Joanna CS Santos"],
    418       "year": 2022,
    419       "relevance": "Provides a benchmark dataset for evaluating code generation security, relevant to methodology of evaluating AI code quality."
    420     },
    421     {
    422       "title": "CodeLMSec benchmark: Systematically evaluating and finding security vulnerabilities in black-box code language models",
    423       "authors": ["Hossein Hajipour", "Keno Hassler", "Thorsten Holz", "Lea Schönherr", "Mario Fritz"],
    424       "year": 2024,
    425       "relevance": "Systematic benchmark for evaluating security vulnerabilities in code language models, relevant to AI code security evaluation."
    426     },
    427     {
    428       "title": "Security Degradation in Iterative AI Code Generation–A Systematic Analysis of the Paradox",
    429       "authors": ["Shivani Shukla", "Himanshu Joshi", "Romilla Syed"],
    430       "year": 2025,
    431       "arxiv_id": "2506.11022",
    432       "relevance": "Analyzes how iterative AI code generation may degrade security over time, directly relevant to the security risk profile of AIGCode."
    433     },
    434     {
    435       "title": "Automatic programming: Large language models and beyond",
    436       "authors": ["Michael R Lyu", "Baishakhi Ray", "Abhik Roychoudhury", "Shin Hwei Tan", "Patanamon Thongtanunam"],
    437       "year": 2025,
    438       "relevance": "Survey of automatic programming with LLMs, providing broader context for AI-assisted code generation research."
    439     }
    440   ]
    441 }

Impressum · Datenschutz