ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32295B)


      1 {
      2   "paper": {
      3     "title": "Give LLMs a Security Course: Securing Retrieval-Augmented Code Generation via Knowledge Injection",
      4     "authors": ["Bo Lin", "Shangwen Wang", "Yihao Qin", "Liqian Chen", "Xiaoguang Mao"],
      5     "year": 2025,
      6     "venue": "Conference on Computer and Communications Security",
      7     "arxiv_id": "2504.16429",
      8     "doi": "10.1145/3719027.3765049"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "CodeGuarder, a security-hardening framework for RACG systems, injects security knowledge (root causes, fixing patterns) derived from CVE databases into code generation prompts. On the CyberSecEval benchmark across 4 LLMs and 4 languages, it improves security rates by 20.12% in standard RACG, 31.53% and 21.91% under two poisoning scenarios, without degrading functional correctness (slightly improves CodeBLEU and pass@1). It also outperforms SafeCoder by 9.80% in non-retrieval settings and demonstrates cross-language generalization even without target-language-specific security knowledge.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. No mention of releasing CodeGuarder's implementation."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The evaluation benchmark CyberSecEval is publicly available, and the security knowledge base source data ReposVul is also public. However, the constructed security knowledge base S itself is not explicitly released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Section 5.6 mentions an A100 GPU server running Ollama and basic model configurations (temperature=0, max_new_tokens=4096, context window=8192), but no requirements.txt, Dockerfile, or detailed dependency list is provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions, README, or experiment scripts are provided or referenced."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All main results in Tables 4-9 are reported as point estimates without confidence intervals or error bars. The only variance data (0.38% max deviation across 5 runs) appears in the threats to validity section for a single model."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are used. Claims that CodeGuarder 'improves' or 'outperforms' baselines are based solely on comparing raw numbers without any hypothesis testing."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Percentage improvements are reported with baseline context throughout (e.g., '20.12% improvement' from 60.84 to 73.08 in Table 4). Relative and absolute improvements are provided for all comparisons."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The choice of CyberSecEval (1,916 instances) as benchmark is justified by its comprehensiveness but no power analysis or statistical justification for sample size adequacy is provided."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Main results are single-run. Section 7.5 reports 5 runs of DS-V3 with max 0.38% SR deviation, but this is only for one model under one scenario and appears only in the threats to validity discussion, not in main results tables."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "RQ1-RQ2 compare with/without CodeGuarder. RQ3 compares against three state-of-the-art baselines: Sven, SafeCoder, and CoSec (Table 7)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines are recent: Sven (CCS 2023), SafeCoder (2024), CoSec (ISSTA 2024). These represent the state of the art in secure code generation hardening."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 7.3 presents an ablation study with two variants: CodeGuarder-QD (without query decomposition) and CodeGuarder-KRF (without knowledge re-ranking and filtering). Results in Table 11 show that removing QD lowers SR by ~10.1% in relative terms (76.36% to 68.61% in the standard scenario), while removing KRF lowers it only to 74.13%."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Security Rate (SR) and CodeBLEU Similarity (Sim) are used throughout. Additionally, Pass@1 and Pass@5 on MBPP and HumanEval are reported in Table 9 for functional correctness."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "All security evaluation is automated via CyberSecEval's Insecure Code Detector. No human evaluation of code security or functionality is performed."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The security knowledge base S is constructed from ReposVul while evaluation is on CyberSecEval, ensuring separation. Section 5.2 explicitly notes: 'Using the same dataset for both the vulnerable code and the security knowledge base could lead to overlapping fix strategies.'"
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by programming language (C, C++, Java, Python in Tables 4-6), by LLM (4 models), by CWE type (Table 10, MITRE Top-25), and by scenario (standard, poisoning I, poisoning II)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 7.2 discusses failure cases: CWE-79 (XSS) has only 4.76% prevention rate due to lack of JavaScript-specific knowledge. CWE-22 (path traversal) at 8.26% is also highlighted as challenging."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Several negative findings are reported: CWE-79 prevention is near zero (4.76%), smaller models benefit less from CodeGuarder than larger models, and excessive knowledge injection degrades performance (Table 12 shows degradation at high k' and k values)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of 20.12% improvement in standard RACG, 31.53% and 21.91% under poisoning scenarios are directly supported by average SR values in Tables 4, 5, and 6 respectively. The generalization claim of 75.54% SR across four languages matches Table 7."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper's causal claim that CodeGuarder improves security is supported by controlled comparisons: same LLM, same benchmark, same scenario, with the only variable being CodeGuarder's security knowledge injection. The ablation study (Section 7.3) isolates component contributions through single-variable manipulation."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The abstract and conclusion claim CodeGuarder is 'a pivotal advancement towards building secure and trustworthy RACG systems' and 'the first security-hardening framework for RACG systems.' These broad claims extend beyond the tested setting of CyberSecEval with 4 LLMs. The title frames results as 'Securing Retrieval-Augmented Code Generation' generally, not bounded to the specific benchmark and models tested."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The threats to validity section addresses measurement validity and non-determinism but does not consider alternative explanations for the improvements, such as whether longer prompts (rather than security content specifically) might account for some gains, or whether the improvements reflect prompt engineering rather than security knowledge per se."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 7.5 explicitly acknowledges that their security measurement 'may not perfectly capture the true security posture' and that 50 CWEs 'may not represent the complete universe of potential security flaws.' They also note Sim is a proxy for functionality and supplement with test-based evaluation (MBPP/HumanEval) in Section 7.1."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Models are listed as 'GPT-4o', 'DeepSeek-V3', 'CodeLlama-13B', and 'DeepSeek-Coder-V2-16B' without specific version strings or API snapshot dates. 'GPT-4o' without a snapshot date does not count as a specified version per the schema criteria."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Three prompt templates are provided in the Appendix: Prompt 1 (security knowledge extraction), Prompt 2 (query decomposition), and Prompt 3 (security-augmented code generation with a concrete worked example). While Prompts 1-2 use placeholders, Prompt 3 includes actual filled content showing the exact format sent to the model."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 5.6 reports temperature=0, max_new_tokens=4096, context window=8192, and notes other parameters at defaults. Section 7.4 reports k'=2 and k=5 as retrieval hyperparameters."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The multi-stage pipeline is described in detail: offline knowledge base construction (Section 4.1), query decomposition (Section 4.2.1), similarity-based retrieval (Section 4.2.2), re-ranking and filtering (Section 4.2.3), and security-augmented generation (Section 4.3). Figure 2 provides the workflow diagram."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 5.2 describes construction of all three knowledge bases: S from ReposVul function-level vulnerability pairs, K from fixed code in ReposVul, V from CyberSecEval vulnerable code. Table 2 provides statistics. Section 4.1 details the LLM-based extraction pipeline for security knowledge."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7.5 'Threats to Validity' provides substantive discussion of two specific threats: validity of security measurement and reliability/LLM non-determinism."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 7.5 discusses study-specific threats: (1) CyberSecEval's Insecure Code Detector covers only 50 CWEs and may miss vulnerabilities, (2) LLM non-determinism quantified via 5 independent DS-V3 runs showing 0.38% max deviation. Both are specific to this study's design."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what the results do NOT show. There are no statements like 'our results do not demonstrate protection against novel attack types' or 'we do not claim security against runtime vulnerabilities.' The threats section discusses measurement limitations but not explicit scope boundaries on claims."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (generated code, per-instance security verdicts, retrieval results) is made available. Only aggregated results in tables are reported."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 5.2 describes the construction of all three knowledge bases with sources, processing steps, and statistics (Table 2). ReposVul provides 12,053 function-level vulnerability pairs; CyberSecEval provides 1,916 instances across 50 CWE types."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are standard public benchmarks (CyberSecEval) and datasets (ReposVul)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The full pipeline is documented: CVE instances → LLM-based extraction of functionality/root cause/fixing pattern → security knowledge base S (Section 4.1). The knowledge base statistics are provided in Table 2 (e.g., 8,861 C security knowledge entries vs 12,053 source vulnerabilities, with explanation that some vulnerabilities have multiple root causes)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The Acknowledgments section is present but contains no funding information. No grants, sponsors, or funding agencies are mentioned."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are listed as affiliated with College of Computer Science, National University of Defense Technology, Changsha, China, and the State Key Laboratory of Complex & Critical Software Environment."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information is disclosed, making it impossible to assess funder independence. The authors are from a national defense university, which could have institutional interests in security tools."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is included in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the four LLMs evaluated (GPT-4o, DeepSeek-V3, CodeLlama-13B, DeepSeek-Coder-V2-16B). This is relevant since CyberSecEval benchmarks could be in their training data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether CyberSecEval test instances appeared in the training data of the evaluated LLMs. CyberSecEval was published in 2023; most of the evaluated models (e.g., GPT-4o, DeepSeek-V3) were released after this date, so exposure during training is plausible."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "CyberSecEval was published in 2023 (arXiv:2312.04724). Models like GPT-4o and DeepSeek-V3 were trained after this date and may have seen the benchmark. This contamination risk is not discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. All evaluation is automated."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, latency, or API costs reported. CodeGuarder adds extra pipeline stages (LLM-based query decomposition, knowledge retrieval, re-ranking and filtering) on top of standard RACG, but the cost overhead is never quantified."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Section 5.6 mentions 'A100 GPU server' but provides no total GPU hours, API spend, or training time for reproducing SafeCoder/Sven baselines or running the full evaluation."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Section 7.5 reports 5 runs for DS-V3 only under the standard scenario, showing 0.38% max deviation. This covers only one of four models and one of three scenarios, and is relegated to threats to validity rather than main results."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Main results appear to be single-run. The 5-run analysis is mentioned only for DS-V3 in the threats to validity section, not as the basis for main experimental results."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Section 7.4 and Table 12 transparently show the hyperparameter grid search over k' ∈ {1,2,3} and k ∈ {3,5,7,9} for two models (DS-V3 and CodeLlama-13B), with all configurations' SR values reported."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Section 7.4 explains the selection of k'=2, k=5: 'aiming for robust performance across different model scales while mitigating degradation.' All tested configurations are shown in Table 12."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper makes numerous comparisons across 4 LLMs, 4+ languages, and 3 scenarios without any statistical tests or multiple comparison corrections."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement their own system and compare against baselines. For SafeCoder they use the authors' trained model, and for Sven/CoSec they follow official implementations. However, they do not explicitly acknowledge or discuss the bias of evaluating their own system."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "CodeGuarder adds overhead of query decomposition, retrieval, and re-ranking/filtering on top of standard RACG. This additional compute cost is never compared against the performance gains. Baselines (Sven, SafeCoder, CoSec) have different computational profiles not discussed."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Section 5.2 justifies CyberSecEval's construct validity: 1,916 instances across 50 CWE types, 96% detection precision. Section 7.5 acknowledges '50 CWEs... may not represent the complete universe of potential security flaws.' The paper contrasts CyberSecEval with alternatives (LLMSecEval: only 150 instances, 18 CWEs)."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "All model comparisons use the same RACG pipeline and CodeGuarder framework, controlling for scaffolding differences. The with/without CodeGuarder comparison isolates the security knowledge injection as the only variable."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "CyberSecEval was published in 2023 and most of the evaluated models (e.g., GPT-4o, DeepSeek-V3) were released after this date. The possibility that models saw CyberSecEval instances during training is not discussed."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information. The security knowledge injected into prompts is derived from vulnerability databases that may overlap with model training data."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": true,
    353         "justification": "Section 5.2 explicitly separates data sources: security knowledge from ReposVul, vulnerable code from CyberSecEval, stating: 'Using the same dataset for both the vulnerable code and the security knowledge base could lead to overlapping fix strategies, making poisoning attempts easier to detect.'"
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection methods (canary strings, membership inference, n-gram overlap analysis) are applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "CodeGuarder improves security rate by 20.12% on average in standard RACG scenarios across 4 LLMs and 4 languages.",
    365       "evidence": "Table 4 shows average SR increases from 60.84 to 73.08 across GPT-4o, DS-V3, CodeLlama, and DS-Coder on C, C++, Java, and Python (Section 6.2).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "CodeGuarder improves security rate by 31.53% under targeted poisoning (Scenario I) where attacker has access to programming intents.",
    370       "evidence": "Table 5 shows average SR increases from 50.57 to 66.52 across all LLMs and languages (Section 6.3.1).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "CodeGuarder improves security rate by 21.91% under intent-agnostic poisoning (Scenario II).",
    375       "evidence": "Table 6 shows average SR increases from 60.05 to 73.20 across all LLMs and languages (Section 6.3.2).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "CodeGuarder does not compromise functional correctness of generated code.",
    380       "evidence": "Tables 4-6 show Sim metric slightly improves with CodeGuarder. Table 9 shows slight improvements on MBPP and HumanEval (e.g., GPT-4o Pass@1 on MBPP: 72.8 → 73.2) (Section 7.1).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "CodeGuarder outperforms state-of-the-art SafeCoder by 9.80% in security rate in non-retrieval scenarios.",
    385       "evidence": "Table 7 shows CodeGuarder achieves 79.40% average SR vs SafeCoder's 72.31% on Mistral-7B across C, C++, Python (Section 6.4.1).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "CodeGuarder generalizes to languages without target-specific security knowledge, improving security by 15.69% in standard scenarios.",
    390       "evidence": "Table 8 shows average SR improvements from 55.36% to 64.04% across C#, JavaScript, PHP, and Rust when no language-specific knowledge is available (Section 6.4.2).",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Query decomposition is the most impactful component, contributing ~10.1% SR improvement.",
    395       "evidence": "Table 11 ablation study: removing QD drops SR from 76.36% to 68.61% in standard scenario; removing only KRF drops to 74.13% (Section 7.3).",
    396       "supported": "strong"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No code or artifact release",
    402       "detail": "Despite proposing a complete framework (CodeGuarder) with multiple components (knowledge base, retriever, re-ranker), no code repository, constructed knowledge base, or experimental artifacts are released. This prevents independent verification and reproduction."
    403     },
    404     {
    405       "flag": "No statistical significance testing",
    406       "detail": "All comparative claims are based on raw number comparisons without any statistical tests, confidence intervals, or error bars on main results. The 5-run analysis covers only one model under one scenario."
    407     },
    408     {
    409       "flag": "Same research group evaluates their own attack and defense",
    410       "detail": "Reference [24] (Lin et al. 2025, 'Exploring the Security Threats of Knowledge Base Poisoning in RACG') shares authors with this paper. The same group defined the poisoning attack scenarios and now proposes the defense, potentially biasing both the threat model and evaluation design."
    411     },
    412     {
    413       "flag": "Benchmark contamination unaddressed",
    414       "detail": "CyberSecEval was published in 2023. GPT-4o and DeepSeek-V3, trained after 2023, may have seen benchmark instances during training. Baseline security rates could be inflated or deflated by contamination, affecting the measured improvement from CodeGuarder."
    415     },
    416     {
    417       "flag": "Baseline comparison limited to small models only",
    418       "detail": "The comparison with prior methods (Sven, SafeCoder, CoSec) in Table 7 is limited to Mistral-7B and CodeLlama-7B, not the larger models (GPT-4o, DS-V3) used in main experiments. This is justified by training costs but limits the strength of the state-of-the-art comparison."
    419     },
    420     {
    421       "flag": "Cost overhead not quantified",
    422       "detail": "CodeGuarder adds query decomposition (LLM call), retrieval, re-ranking, and prompt augmentation on top of standard RACG. The computational overhead is never measured, making it impossible to assess practical deployability."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions",
    428       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"],
    429       "year": 2022,
    430       "relevance": "Foundational study evaluating security of LLM-generated code, finding ~40% of Copilot code contains vulnerabilities."
    431     },
    432     {
    433       "title": "Large language models for code: Security hardening and adversarial testing",
    434       "authors": ["Jingxuan He", "Martin Vechev"],
    435       "year": 2023,
    436       "relevance": "Proposes SVEN, a prefix-tuning approach for steering LLM code generation toward security properties."
    437     },
    438     {
    439       "title": "Instruction tuning for secure code generation",
    440       "authors": ["Jingxuan He", "Mark Vero", "Gabriela Krasnopolska", "Martin Vechev"],
    441       "year": 2024,
    442       "arxiv_id": "2402.09497",
    443       "relevance": "SafeCoder uses instruction tuning to train LLMs for secure code generation, the primary baseline for CodeGuarder."
    444     },
    445     {
    446       "title": "CoSec: On-the-Fly Security Hardening of Code LLMs via Supervised Co-decoding",
    447       "authors": ["Dong Li", "Meng Yan", "Yaosheng Zhang"],
    448       "year": 2024,
    449       "relevance": "Proposes co-decoding for security hardening without weight modification, a key baseline for non-retrieval comparison."
    450     },
    451     {
    452       "title": "Purple LLaMA CyberSecEval: A secure coding benchmark for language models",
    453       "authors": ["Manish Bhatt", "Sahana Chennabasappa", "Cyrus Nikolaidis"],
    454       "year": 2023,
    455       "arxiv_id": "2312.04724",
    456       "relevance": "Primary evaluation benchmark with 1,916 instances across 50 CWE types for assessing LLM code security."
    457     },
    458     {
    459       "title": "CyberSecEval 3: Advancing the evaluation of cybersecurity risks and capabilities in large language models",
    460       "authors": ["Shengye Wan", "Cyrus Nikolaidis", "Daniel Song"],
    461       "year": 2024,
    462       "arxiv_id": "2408.01605",
    463       "relevance": "Extended version of the CyberSecEval benchmark used for security evaluation in this study."
    464     },
    465     {
    466       "title": "Exploring the Security Threats of Knowledge Base Poisoning in Retrieval-Augmented Code Generation",
    467       "authors": ["Bo Lin", "Shangwen Wang", "Liqian Chen", "Xiaoguang Mao"],
    468       "year": 2025,
    469       "arxiv_id": "2502.03233",
    470       "relevance": "Companion paper by the same group that defines the RACG poisoning attack scenarios used in this evaluation."
    471     },
    472     {
    473       "title": "How secure is AI-generated code: a large-scale comparison of large language models",
    474       "authors": ["Norbert Tihanyi", "Tamas Bisztray", "Mohamed Amine Ferrag"],
    475       "year": 2025,
    476       "relevance": "Large-scale analysis of LLM code security across 13 models, providing the vulnerability distribution used for CodeGuarder's re-ranking weights."
    477     },
    478     {
    479       "title": "Evaluating large language models trained on code",
    480       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    481       "year": 2021,
    482       "arxiv_id": "2107.03374",
    483       "relevance": "Introduces HumanEval benchmark used to evaluate CodeGuarder's impact on functional correctness."
    484     },
    485     {
    486       "title": "Program synthesis with large language models",
    487       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    488       "year": 2021,
    489       "arxiv_id": "2108.07732",
    490       "relevance": "Introduces MBPP benchmark used to evaluate CodeGuarder's impact on functional correctness."
    491     },
    492     {
    493       "title": "Is your AI-generated code really safe? Evaluating large language models on secure code generation with CodeSecEval",
    494       "authors": ["Jiexin Wang", "Xitong Luo", "Liuwen Cao"],
    495       "year": 2024,
    496       "arxiv_id": "2407.02395",
    497       "relevance": "Evaluates security of LLM-generated code, corroborating persistent security vulnerabilities in generated code."
    498     },
    499     {
    500       "title": "Using AI Assistants in Software Development: A Qualitative Study on Security Practices and Concerns",
    501       "authors": ["Jan H Klemmer", "Stefan Albert Horstmann", "Nikhil Patnaik"],
    502       "year": 2024,
    503       "relevance": "Qualitative study on security practices with AI coding assistants, highlighting real-world security concerns in AI-assisted development."
    504     },
    505     {
    506       "title": "ReposVul: A Repository-Level High-Quality Vulnerability Dataset",
    507       "authors": ["Xinchen Wang", "Ruida Hu", "Cuiyun Gao"],
    508       "year": 2024,
    509       "relevance": "Provides the 12,053 function-level vulnerability pairs used to construct CodeGuarder's security knowledge base."
    510     }
    511   ],
    512   "engagement_factors": {
    513     "practical_relevance": {
    514       "score": 2,
    515       "justification": "Security hardening for RAG-based code generation is directly relevant to practitioners, but the framework is not released as a usable tool."
    516     },
    517     "surprise_contrarian": {
    518       "score": 1,
    519       "justification": "The finding that injecting security knowledge helps is intuitive; the poisoning vulnerability of RACG was already established in the authors' prior work."
    520     },
    521     "fear_safety": {
    522       "score": 2,
    523       "justification": "Demonstrates that poisoned knowledge bases can cause nearly half of generated code to be vulnerable, raising practical security concerns for RAG-based coding tools."
    524     },
    525     "drama_conflict": {
    526       "score": 0,
    527       "justification": "No controversial claims or challenges to established players; presents a constructive defense framework."
    528     },
    529     "demo_ability": {
    530       "score": 0,
    531       "justification": "No code, demo, or installable tool is released."
    532     },
    533     "brand_recognition": {
    534       "score": 1,
    535       "justification": "Evaluates GPT-4o and DeepSeek-V3, which are recognizable, but the authors' institution (NUDT) is not widely known in the AI popular press."
    536     }
    537   }
    538 }

Impressum · Datenschutz