scan.json (30919B)
1 { 2 "paper": { 3 "title": "Exploring the Security Threats of Knowledge Base Poisoning in Retrieval-Augmented Code Generation", 4 "authors": [ 5 "Bo Lin", 6 "Shangwen Wang", 7 "Liqian Chen", 8 "Xiaoguang Mao" 9 ], 10 "year": 2025, 11 "venue": "arXiv.org", 12 "arxiv_id": "2502.03233", 13 "doi": "10.48550/arXiv.2502.03233" 14 }, 15 "scan_version": 3, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "Knowledge base poisoning in RACG systems poses a significant security threat: even a single poisoned code example can compromise up to 48% of generated code (CodeLlama with JINA retriever). Dense retrievers (JINA) are far more susceptible than sparse retrievers (BM25) due to better semantic matching. Code-specialized LLMs are more prone to generating vulnerable code than general-purpose models. Vulnerability propagation risk increases sharply when example-query similarity exceeds 60%, and CWE-352 (CSRF) consistently shows the highest vulnerability rates (~0.8) among top CWE types.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No repository URL or code archive is provided. The paper references third-party implementations (BM25 from GitHub, JINA from Huggingface) but does not release its own experimental code or scripts." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "The base dataset ReposVul is publicly available, but the authors' constructed dataset—including LLM-generated queries, poisoned knowledge bases, and generated code outputs—is not released." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "§4.6 mentions 'a single A100-40G GPU server using the Ollama framework' with some model parameters, but no requirements.txt, Dockerfile, or detailed dependency list is provided. Not sufficient to recreate the environment." 
35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No reproduction instructions, README, or runnable scripts are provided. A researcher would need to reverse-engineer the full pipeline from the paper text." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "All results in Tables 4–9 are point estimates (e.g., VR = 0.48). No confidence intervals, error bars, or uncertainty measures are reported anywhere in the paper." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper makes numerous comparative claims (e.g., '6.5% more vulnerabilities from one-shot to three-shot', 'CodeLlama exhibits the highest susceptibility') without any statistical significance tests." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Effect sizes are reported with baseline context throughout. For example, §5.1.1 states 'CodeLlama's VR increases from 0.29 to 0.48' (a 19pp increase), and §5.1.2 reports '6.5% (0.46→0.49)' providing both absolute and relative context." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "The dataset contains 12,053 instances from ReposVul. No justification for why this size is sufficient, and no power analysis is discussed. The manual inspection sample sizes (81–95) are justified via confidence level calculation (§6.2)." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "No variance, standard deviation, or spread measures are reported. §4.6 sets temperature to 0 'to reduce non-determinism' but does not report variance across runs or verify determinism." 
67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "All experiments include a zero-poisoning baseline (poisoning number 0 in Table 4, proportion 0 in Table 5), allowing comparison of poisoned vs. unpoisoned performance." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "The four LLMs (GPT-4o, Llama-3-8B, CodeLlama-13B, DeepSeek-Coder-V2-16B) are contemporary as of the study period, with models selected from the LLM Safety Leaderboard as of October 2024 (§4.4.1)." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "The study systematically varies poisoning quantity (0–9 samples, 0–100% proportion), number of shots (1 vs 3), retriever type (JINA vs BM25), programming language, similarity range, and CWE type—each serving as an ablation of a different factor." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Three metrics are used: Vulnerability Rate (VR), CrystalBLEU similarity, and Vulnerability Rate in Retrieved Code (VRRC), defined in §4.5." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": true, 93 "justification": "§6.2 describes manual inspection where 'Two authors independently evaluated the samples through manual review' on 360 generated code samples (95+81+93+91) to validate the LLM judge's accuracy." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": false, 98 "justification": "No explicit separation of development and test sets is described. The same dataset appears to be used for all experiments without a held-out split." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Extensive breakdowns are provided: by programming language (Table 7), CWE type (Tables 9, 12), similarity range (Table 8), retriever, scenario, and LLM." 
104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper discusses where poisoning is less effective: BM25 shows smaller VR increases than JINA (§5.1.1), CWE-434 has lowest VR (§5.2.3), and Scenario II is much harder for attackers (§5.1.1)." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Several negative/null results: poisoning has 'negligible' impact on code functionality (similarity metric barely changes), BM25 retriever shows minimal vulnerability increase, and low-similarity examples have 'relatively minor impact on vulnerability likelihood' (§5.2.2)." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract claim of '48% of the generated code' being vulnerable from a single poisoned sample is supported by Table 4 (CodeLlama + JINA: 0.29→0.48). The 6.5% increase from one-shot to three-shot matches Table 6 aggregated results." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The study uses controlled experimental manipulation: systematically varying the number of poisoned examples while holding other factors constant. The causal claim that poisoning 'compromises' security is supported by this controlled design." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": true, 130 "justification": "§6.5 explicitly bounds generalization: the four languages 'may not fully represent real-world development scenarios' and account for only '42.7% of the total activity.' Findings are presented per-model and per-retriever rather than as universal claims." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": true, 135 "justification": "§6.3 provides an alternative explanation for JINA vs BM25 differences (retrieval effectiveness, validated with MRR/SR@k metrics). 
§6.5 discusses query generation accuracy (86%) as a potential confound. §5.1.1 discusses inherent LLM vulnerability generation even without poisoning." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": false, 140 "justification": "VR is measured by an LLM judge with 77–81% accuracy (§6.2), meaning ~20% of labels are incorrect. The paper does not discuss the gap between LLM-judge-detected vulnerability and actual exploitable security risk. The proxy (LLM judge classification) is treated as equivalent to the outcome (real-world security threat)." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "Models are identified as 'GPT-4o', 'Llama-3-8B', 'CodeLLAMA-13B', 'DeepSeek-Coder-V2-16B'. Parameter counts are given for open-source models, but no snapshot dates or API versions are specified. 'GPT-4o' is a marketing name without a snapshot date." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": false, 152 "justification": "Appendix provides prompt templates for query generation (Prompt 1), vulnerability extraction (Prompt 2), and security assessment (Prompt 3), but all contain placeholders ({LANGUAGE}, {FUNCTION}, {DIFF}, etc.). The actual code generation prompt used to instruct LLMs in the RACG pipeline is not provided at all." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": true, 157 "justification": "§4.6 reports: 'temperature of 0, top-p value of 0.95, a max_new_tokens setting of 4096, and a context window of 8192, keeping other parameters at default values.'" 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding is used. The system is a standard RAG pipeline (retriever + LLM generation) without agents, tools, or feedback loops." 
163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "§4.1 documents filtering: functions shorter than three lines and names containing 'test' were removed. §4.2 describes knowledge base construction and poisoning process in detail. Table 2 provides final statistics (12,053 instances, 236 CWEs)." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "§6.5 'Threats to Validity' is a dedicated subsection with substantive discussion of query generation accuracy, programming language coverage, and their implications." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "§6.5 discusses study-specific threats: DeepSeek-V2.5 may produce inaccurate queries (mitigated by manual review showing 86% accuracy), and the four programming languages account for only 42.7% of GitHub activity." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "While §6.5 notes that 'the selected languages may not fully represent real-world development scenarios,' the paper does not explicitly state what the results do NOT show—e.g., no statement about applicability to different retriever types, real-world deployment settings, or non-function-level poisoning." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "No raw data (generated code, judge outputs, poisoned knowledge bases) is made available. Only aggregated results in tables are presented." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "§4.1 describes dataset selection in detail: 12 candidate datasets evaluated against 4 criteria (Table 1), ReposVul selected as it satisfies all requirements. Filtering criteria and resulting statistics (Table 2) are documented." 
197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. Data comes from the ReposVul dataset, a standard publicly available vulnerability dataset." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The full pipeline is documented: ReposVul → filtering (§4.1) → query generation (§4.1) → knowledge base construction (§4.2) → poisoning (§4.2.1/§4.2.2) → code generation → result validation (§4.3). Each stage has explicit criteria." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding source or acknowledgments section is present in the paper text. Whether the work is funded or unfunded is unknown." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "All four authors are listed as affiliated with the National University of Defense Technology. They evaluate third-party LLMs (GPT-4o, Llama-3, CodeLlama, DeepSeek-Coder), not their own products." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "No funding is disclosed, so independence of funding from outcomes cannot be assessed." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement is present in the paper." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "No training data cutoff dates are stated for any of the four LLMs used. This matters because ReposVul vulnerabilities may be in GPT-4o's or Llama-3's training data." 
236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": false, 240 "justification": "No discussion of whether ReposVul code examples appeared in any LLM's training data. The models may have memorized the secure or vulnerable code patterns, confounding the results." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": false, 245 "justification": "ReposVul is built from public repositories available before model training cutoffs. No discussion of whether models have seen these code snippets during training, which could affect both VR baseline and poisoning effectiveness." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in the study. Manual validation of the LLM judge by the authors is not a human subjects study." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants. The study analyzes code generation and vulnerability propagation, not human behavior." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in the study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in the study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in the study." 
283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "No inference costs, API costs, or per-example latency are reported despite using the GPT-4o API and running 16 sub-scenarios across 12,053 instances." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "Hardware is mentioned ('A100-40G GPU server') but no total compute budget (GPU hours, API spend, wall-clock time for experiments) is reported." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "Temperature is set to 0 'to reduce non-determinism' (§4.6) but no multi-seed or multi-run experiments are conducted. Determinism is assumed, not verified." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "The number of experimental runs is never explicitly stated. Results appear to be from a single run per configuration." 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "Hyperparameters (temperature=0, top-p=0.95, etc.) appear to be fixed choices with no search or justification for these values beyond 'keeping other parameters at default values.'" 312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": true, 316 "justification": "The paper reports results across all configurations rather than selecting a best one. For RQ2 analysis, moderate poisoning quantities are chosen with justification: 'LLM-generated code exhibits similar patterns across all metrics, regardless of poisoning quantity' (§5.2)." 
317 }, 318 "multiple_comparison_correction": { 319 "applies": true, 320 "answer": false, 321 "justification": "The paper makes many comparisons across 4 LLMs × 2 retrievers × 2 scenarios × multiple poisoning levels without any multiple comparison correction or even significance testing." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": true, 326 "justification": "The authors evaluate third-party LLMs (GPT-4o, Llama-3, CodeLlama, DeepSeek-Coder) rather than their own system. The poisoning methodology is the contribution, and they report results across all models without cherry-picking favorable configurations." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": false, 331 "justification": "Models of vastly different sizes are compared (8B vs GPT-4o) without discussing compute differences. No performance-per-compute analysis is provided." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": false, 336 "justification": "The VR metric relies on an LLM judge with 77–81% accuracy. The paper validates the judge (§6.2) but does not discuss whether detecting vulnerability patterns via LLM actually measures real-world security risk—e.g., whether flagged vulnerabilities are exploitable." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": false, 340 "answer": false, 341 "justification": "No agentic scaffolding is used. The RAG pipeline is simple retrieval + generation without scaffolding." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of temporal leakage. ReposVul contains vulnerabilities from public repositories that predate the LLMs' training cutoffs, meaning models may have already seen these code patterns during training." 
349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether providing vulnerable code as in-context examples leaks information differently than real-world RAG settings (e.g., whether format/structure of injected examples is distinguishable)." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether training data and test examples share structural similarities (same repositories, same authors, duplicate patterns) that could inflate results." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": false, 363 "justification": "No leakage detection or prevention methods (canary strings, membership inference, decontamination) are applied." 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "Even a single poisoned code example can compromise up to 48% of generated code when programming intent is exposed.", 370 "evidence": "Table 4 shows CodeLlama with JINA retriever: VR increases from 0.29 (baseline) to 0.48 with 1 poisoned sample in Scenario I (§5.1.1).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Without access to user intent, injecting vulnerability code equivalent to 20% of the knowledge base can lead to approximately 36% of generated code being vulnerable.", 375 "evidence": "Table 5 shows CodeLlama with JINA at 0.2 poisoning proportion: VR = 0.36 in Scenario II (§5.1.1).", 376 "supported": "strong" 377 }, 378 { 379 "claim": "More few-shot examples increase vulnerability rates: LLMs generated 6.5% more vulnerabilities from one-shot to three-shot with JINA retriever.", 380 "evidence": "Table 6 shows aggregated VR ('All' column) increasing from 0.46 to 0.49 (one-shot to three-shot) in JINA-I setting (§5.1.2).", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Code LLMs are more prone to generating vulnerable code than general-purpose LLMs.", 385 "evidence": "Table 4 shows CodeLlama 
consistently has the highest VR across all configurations, followed by DS-Coder. Llama-3 (general-purpose) shows lower VR (§5.1.1).", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "C++ consistently exhibits the highest vulnerability rate across most sub-scenarios.", 390 "evidence": "Table 7 shows C++ averaging 0.47 (JINA-I), 0.42 (BM25-I), 0.46 (JINA-II), 0.44 (BM25-II) compared to lower rates for Java and Python (§5.2.1).", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Example-query similarity above 60% significantly increases vulnerability risk.", 395 "evidence": "Table 8 shows VR jumps from 0.35 at [40,60) to 0.42 at [60,80) and 0.53 at [80,100] in Scenario I (§5.2.2).", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "CWE-352 (Cross-Site Request Forgery) consistently shows the highest vulnerability rates (~0.8) among MITRE Top-10.", 400 "evidence": "Table 9 shows CWE-352 averaging 0.79 in Scenario I and 0.78 in Scenario II, far above other CWE types (§5.2.3).", 401 "supported": "strong" 402 }, 403 { 404 "claim": "Dense retrievers (JINA) introduce significantly more vulnerabilities than sparse retrievers (BM25).", 405 "evidence": "Tables 4-6 consistently show higher VR with JINA vs BM25. Table 11 confirms JINA's superior retrieval capability (MRR 0.85 vs 0.20) which amplifies vulnerability propagation (§6.3).", 406 "supported": "strong" 407 } 408 ], 409 "red_flags": [ 410 { 411 "flag": "LLM-as-judge with significant error rate", 412 "detail": "The primary evaluation metric (VR) relies on an LLM judge (DeepSeek-V2.5) with only 77–81% accuracy across languages (§6.2). This means approximately 1 in 5 vulnerability labels may be incorrect, but no uncertainty propagation from this error rate to the final results is performed." 
413 }, 414 { 415 "flag": "No statistical significance testing", 416 "detail": "Numerous comparative claims are made (e.g., '6.5% more vulnerabilities', 'CodeLlama exhibits the highest susceptibility') based solely on comparing point estimates without any significance tests, confidence intervals, or error bars." 417 }, 418 { 419 "flag": "Training data contamination unaddressed", 420 "detail": "ReposVul contains code from public repositories that likely appeared in the training data of GPT-4o, Llama-3, and CodeLlama. The baseline VR (0.18–0.29 without poisoning) could partly reflect memorized vulnerability patterns rather than in-context influence." 421 }, 422 { 423 "flag": "Single-run results without variance reporting", 424 "detail": "All results appear to be from single experimental runs. Temperature=0 reduces but does not eliminate non-determinism, and LLM outputs can vary across API calls even at temperature 0. No variance or stability analysis is provided." 425 }, 426 { 427 "flag": "LLM-generated queries introduce noise", 428 "detail": "§6.5 reports that LLM-generated queries achieve only 86% accuracy based on manual review of 100 samples per language. This means ~14% of queries may not accurately represent the code's functionality, yet this error is not propagated into the analysis." 429 } 430 ], 431 "cited_papers": [ 432 { 433 "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions", 434 "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"], 435 "year": 2022, 436 "relevance": "Foundational study assessing security of LLM-generated code (Copilot), directly relevant to understanding AI code generation security risks." 
437 }, 438 { 439 "title": "How secure is code generated by ChatGPT?", 440 "authors": ["Raphaël Khoury", "Anderson R Avila", "Jacob Brunelle", "Baba Mamadou Camara"], 441 "year": 2023, 442 "relevance": "Evaluates security of ChatGPT-generated code, establishing baseline vulnerability rates in LLM code generation." 443 }, 444 { 445 "title": "How secure is AI-generated code: a large-scale comparison of large language models", 446 "authors": ["Norbert Tihanyi", "Tamas Bisztray", "Mohamed Amine Ferrag", "Ridhi Jain", "Lucas C Cordeiro"], 447 "year": 2025, 448 "relevance": "Large-scale comparison of LLM code security, providing context for vulnerability rates across different models." 449 }, 450 { 451 "title": "PoisonedRAG: Knowledge poisoning attacks to retrieval-augmented generation of large language models", 452 "authors": ["Wei Zou", "Runpeng Geng", "Binghui Wang", "Jinyuan Jia"], 453 "year": 2024, 454 "arxiv_id": "2402.07867", 455 "relevance": "Proposes knowledge base poisoning attacks on RAG systems for general LLM tasks; this paper extends the concept to code generation security." 456 }, 457 { 458 "title": "Poisoning web-scale training datasets is practical", 459 "authors": ["Nicholas Carlini", "Matthew Jagielski", "Christopher A Choquette-Choo"], 460 "year": 2024, 461 "relevance": "Demonstrates practical feasibility of poisoning public data sources, supporting the threat model of RACG knowledge base poisoning." 462 }, 463 { 464 "title": "Code Llama: Open foundation models for code", 465 "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"], 466 "year": 2023, 467 "arxiv_id": "2308.12950", 468 "relevance": "Describes CodeLlama, a code-specialized LLM used as a primary subject in the study and shown to be most susceptible to poisoning." 
469 }, 470 { 471 "title": "Retrieval augmented code generation and summarization", 472 "authors": ["Md Rizwan Parvez", "Wasi Uddin Ahmad", "Saikat Chakraborty", "Baishakhi Ray", "Kai-Wei Chang"], 473 "year": 2021, 474 "arxiv_id": "2108.11601", 475 "relevance": "Early work on retrieval-augmented code generation, establishing the RACG paradigm that this paper's security analysis targets." 476 }, 477 { 478 "title": "Using AI assistants in software development: A qualitative study on security practices and concerns", 479 "authors": ["Jan H Klemmer", "Stefan Albert Horstmann", "Nikhil Patnaik"], 480 "year": 2024, 481 "relevance": "Qualitative study on developer security practices with AI assistants, complementary to this paper's quantitative vulnerability analysis." 482 }, 483 { 484 "title": "RMCBench: Benchmarking large language models' resistance to malicious code", 485 "authors": ["Jiachi Chen", "Qingyuan Zhong", "Yanlin Wang"], 486 "year": 2024, 487 "relevance": "Benchmarks LLMs' ability to resist generating malicious code, directly relevant to understanding model susceptibility to poisoned examples." 488 }, 489 { 490 "title": "CodeRAG-Bench: Can retrieval augment code generation?", 491 "authors": ["Zora Zhiruo Wang", "Akari Asai", "Xinyan Velocity Yu"], 492 "year": 2024, 493 "arxiv_id": "2406.14497", 494 "relevance": "Benchmark for retrieval-augmented code generation, establishing evaluation methodology for RACG systems." 495 }, 496 { 497 "title": "ReposVul: A repository-level high-quality vulnerability dataset", 498 "authors": ["Xinchen Wang", "Ruida Hu", "Cuiyun Gao"], 499 "year": 2024, 500 "relevance": "The primary dataset used in this study, providing paired vulnerable and secure code versions across multiple languages." 
501 }, 502 { 503 "title": "Vul-RAG: Enhancing LLM-based vulnerability detection via knowledge-level RAG", 504 "authors": ["Xueying Du", "Geng Zheng", "Kaixin Wang"], 505 "year": 2024, 506 "arxiv_id": "2406.11147", 507 "relevance": "Uses RAG for vulnerability detection with an extraction-detection pipeline similar to this paper's LLM judge approach." 508 } 509 ], 510 "engagement_factors": { 511 "practical_relevance": { 512 "score": 2, 513 "justification": "Provides actionable insights for anyone building RACG systems (e.g., prefer second-most-similar retrieval, monitor for specific CWEs), but no tool or defense is released." 514 }, 515 "surprise_contrarian": { 516 "score": 1, 517 "justification": "The general concept that poisoned inputs lead to poisoned outputs is intuitive; the specific magnitudes (48% from a single sample) are notable but not paradigm-shifting." 518 }, 519 "fear_safety": { 520 "score": 3, 521 "justification": "Demonstrates a concrete, novel attack vector on RACG code generation systems where a single malicious code example can compromise nearly half of generated code." 522 }, 523 "drama_conflict": { 524 "score": 1, 525 "justification": "Raises concerns about RAG security but does not directly call out any product or company, and findings are presented in a measured academic tone." 526 }, 527 "demo_ability": { 528 "score": 0, 529 "justification": "No code, demo, or tools are released. The attack and evaluation pipeline are described but not made available." 530 }, 531 "brand_recognition": { 532 "score": 1, 533 "justification": "Evaluates GPT-4o (OpenAI) and other known models, but authors are from NUDT and the paper itself is not from a high-profile lab." 534 } 535 } 536 }