ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (32475B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Exploring the Security Threats of Knowledge Base Poisoning in Retrieval-Augmented Code Generation",
      6     "authors": [
      7       "Bo Lin",
      8       "Shangwen Wang",
      9       "Liqian Chen",
     10       "Xiaoguang Mao"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2502.03233",
     15     "doi": "10.48550/arXiv.2502.03233"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract claim of '48% of the generated code' being vulnerable from a single poisoned sample is supported by Table 4 (CodeLlama + JINA: 0.29→0.48). The 6.5% increase from one-shot to three-shot matches Table 6 aggregated results.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The study uses controlled experimental manipulation: systematically varying the number of poisoned examples while holding other factors constant. The causal claim that poisoning 'compromises' security is supported by this controlled design.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "§6.5 explicitly bounds generalization: the four languages 'may not fully represent real-world development scenarios' and account for only '42.7% of the total activity.' Findings are presented per-model and per-retriever rather than as universal claims.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "§6.3 provides an alternative explanation for JINA vs BM25 differences (retrieval effectiveness, validated with MRR/SR@k metrics). §6.5 discusses query generation accuracy (86%) as a potential confound. §5.1.1 discusses inherent LLM vulnerability generation even without poisoning.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "VR is measured by an LLM judge with 77–81% accuracy (§6.2), meaning ~20% of labels are incorrect. The paper does not discuss the gap between LLM-judge-detected vulnerability and actual exploitable security risk. The proxy (LLM judge classification) is treated as equivalent to the outcome (real-world security threat).",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "§6.5 'Threats to Validity' is a dedicated subsection with substantive discussion of query generation accuracy, programming language coverage, and their implications.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "§6.5 discusses study-specific threats: DeepSeek-V2.5 may produce inaccurate queries (mitigated by manual review showing 86% accuracy), and the four programming languages account for only 42.7% of GitHub activity.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "While §6.5 notes that 'the selected languages may not fully represent real-world development scenarios,' the paper does not explicitly state what the results do NOT show—e.g., no statement about applicability to different retriever types, real-world deployment settings, or non-function-level poisoning.",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding source or acknowledgments section is present in the paper text. Whether the work is funded or unfunded is unknown.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors are listed as affiliated with the National University of Defense Technology. They evaluate third-party LLMs (GPT-4o, Llama-3, CodeLlama, DeepSeek-Coder), not their own products.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funding is disclosed, so independence of funding from outcomes cannot be assessed.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement is present in the paper.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "RACG, the threat model (attacker goals, capabilities, two scenarios), and all three metrics (VR, Similarity via CrystalBLEU, VRRC) are defined precisely in Sections 3 and 4.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three explicit contributions are bullet-pointed in Section 1: first comprehensive RACG security study, large-scale 16 sub-scenario experimentation, and practical factor-analysis insights.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 covers LLMs, RAG, RACG, and existing attacks; Section 6.4 explicitly distinguishes this work from PoisonedRAG by contrasting functional vs. security poisoning goals.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No repository URL or code archive is provided. The paper references third-party implementations (BM25 from GitHub, JINA from Huggingface) but does not release its own experimental code or scripts.",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "The base dataset ReposVul is publicly available, but the authors' constructed dataset—including LLM-generated queries, poisoned knowledge bases, and generated code outputs—is not released.",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "§4.6 mentions 'a single A100-40G GPU server using the Ollama framework' with some model parameters, but no requirements.txt, Dockerfile, or detailed dependency list is provided. Not sufficient to recreate the environment.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No reproduction instructions, README, or runnable scripts are provided. A researcher would need to reverse-engineer the full pipeline from the paper text.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All results in Tables 4–9 are point estimates (e.g., VR = 0.48). No confidence intervals, error bars, or uncertainty measures are reported anywhere in the paper.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "The paper makes numerous comparative claims (e.g., '6.5% more vulnerabilities from one-shot to three-shot', 'CodeLlama exhibits the highest susceptibility') without any statistical significance tests.",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Effect sizes are reported with baseline context throughout. For example, §5.1.1 states 'CodeLlama's VR increases from 0.29 to 0.48' (a 19pp increase), and §5.1.2 reports '6.5% (0.46→0.49)' providing both absolute and relative context.",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The dataset contains 12,053 instances from ReposVul, but no justification is given for why this size is sufficient and no power analysis is discussed. Only the manual inspection sample sizes (81–95) are justified, via a confidence level calculation (§6.2).",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No variance, standard deviation, or spread measures are reported. §4.6 sets temperature to 0 'to reduce non-determinism' but does not report variance across runs or verify determinism.",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "All experiments include a zero-poisoning baseline (poisoning number 0 in Table 4, proportion 0 in Table 5), allowing comparison of poisoned vs. unpoisoned performance.",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "The four LLMs (GPT-4o, Llama-3-8B, CodeLlama-13B, DeepSeek-Coder-V2-16B) are contemporary as of the study period, with models selected from the LLM Safety Leaderboard as of October 2024 (§4.4.1).",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "The study systematically varies poisoning quantity (0–9 samples, 0–100% proportion), number of shots (1 vs 3), retriever type (JINA vs BM25), programming language, similarity range, and CWE type—each serving as an ablation of a different factor.",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Three metrics are used: Vulnerability Rate (VR), CrystalBLEU similarity, and Vulnerability Rate in Retrieved Code (VRRC), defined in §4.5.",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "§6.2 describes manual inspection where 'Two authors independently evaluated the samples through manual review' on 360 generated code samples (95+81+93+91) to validate the LLM judge's accuracy.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": false,
    211           "justification": "No explicit separation of development and test sets is described. The same dataset appears to be used for all experiments without a held-out split.",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Extensive breakdowns are provided: by programming language (Table 7), CWE type (Tables 9, 12), similarity range (Table 8), retriever, scenario, and LLM.",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "The paper discusses where poisoning is less effective: BM25 shows smaller VR increases than JINA (§5.1.1), CWE-434 has lowest VR (§5.2.3), and Scenario II is much harder for attackers (§5.1.1).",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Several negative/null results: poisoning has 'negligible' impact on code functionality (similarity metric barely changes), BM25 retriever shows minimal vulnerability increase, and low-similarity examples have 'relatively minor impact on vulnerability likelihood' (§5.2.2).",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "Models are identified as 'GPT-4o', 'Llama-3-8B', 'CodeLLAMA-13B', 'DeepSeek-Coder-V2-16B'. Parameter counts are given for open-source models, but no snapshot dates or API versions are specified. 'GPT-4o' is a marketing name without a snapshot date.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "Appendix provides prompt templates for query generation (Prompt 1), vulnerability extraction (Prompt 2), and security assessment (Prompt 3), but all contain placeholders ({LANGUAGE}, {FUNCTION}, {DIFF}, etc.). The actual code generation prompt used to instruct LLMs in the RACG pipeline is not provided at all.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "§4.6 reports: 'temperature of 0, top-p value of 0.95, a max_new_tokens setting of 4096, and a context window of 8192, keeping other parameters at default values.'",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is used. The system is a standard RAG pipeline (retriever + LLM generation) without agents, tools, or feedback loops.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "§4.1 documents filtering: functions shorter than three lines and names containing 'test' were removed. §4.2 describes knowledge base construction and poisoning process in detail. Table 2 provides final statistics (12,053 instances, 236 CWEs).",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "No raw data (generated code, judge outputs, poisoned knowledge bases) is made available. Only aggregated results in tables are presented.",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "§4.1 describes dataset selection in detail: 12 candidate datasets evaluated against 4 criteria (Table 1), ReposVul selected as it satisfies all requirements. Filtering criteria and resulting statistics (Table 2) are documented.",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants. Data comes from the ReposVul dataset, a standard publicly available vulnerability dataset.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The full pipeline is documented: ReposVul → filtering (§4.1) → query generation (§4.1) → knowledge base construction (§4.2) → poisoning (§4.2.1/§4.2.2) → code generation → result validation (§4.3). Each stage has explicit criteria.",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "No training data cutoff dates are stated for any of the four LLMs used. This matters because ReposVul vulnerabilities may be in GPT-4o's or Llama-3's training data.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "No discussion of whether ReposVul code examples appeared in any LLM's training data. The models may have memorized the secure or vulnerable code patterns, confounding the results.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "ReposVul is built from public repositories available before model training cutoffs. No discussion of whether models have seen these code snippets during training, which could affect both VR baseline and poisoning effectiveness.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in the study. Manual validation of the LLM judge by the authors is not a human subjects study.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants. The study analyzes code generation and vulnerability propagation, not human behavior.",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in the study.",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in the study.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in the study.",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in the study.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in the study.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No inference costs, API costs, or per-example latency are reported despite using the GPT-4o API and running 16 sub-scenarios across 12,053 instances.",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Hardware is mentioned ('A100-40G GPU server') but no total compute budget (GPU hours, API spend, wall-clock time for experiments) is reported.",
    366           "source": "opus"
    367         }
    368       },
    369       "experimental_rigor": {
    370         "seed_sensitivity_reported": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "Temperature is set to 0 'to reduce non-determinism' (§4.6) but no multi-seed or multi-run experiments are conducted. Determinism is assumed, not verified.",
    374           "source": "opus"
    375         },
    376         "number_of_runs_stated": {
    377           "applies": true,
    378           "answer": false,
    379           "justification": "The number of experimental runs is never explicitly stated. Results appear to be from a single run per configuration.",
    380           "source": "opus"
    381         },
    382         "hyperparameter_search_budget": {
    383           "applies": true,
    384           "answer": false,
    385           "justification": "Hyperparameters (temperature=0, top-p=0.95, etc.) appear to be fixed choices with no search or justification for these values beyond 'keeping other parameters at default values.'",
    386           "source": "opus"
    387         },
    388         "best_config_selection_justified": {
    389           "applies": true,
    390           "answer": true,
    391           "justification": "The paper reports results across all configurations rather than selecting a best one. For RQ2 analysis, moderate poisoning quantities are chosen with justification: 'LLM-generated code exhibits similar patterns across all metrics, regardless of poisoning quantity' (§5.2).",
    392           "source": "opus"
    393         },
    394         "multiple_comparison_correction": {
    395           "applies": true,
    396           "answer": false,
    397           "justification": "The paper makes many comparisons across 4 LLMs × 2 retrievers × 2 scenarios × multiple poisoning levels without any multiple comparison correction or even significance testing.",
    398           "source": "opus"
    399         },
    400         "self_comparison_bias_addressed": {
    401           "applies": true,
    402           "answer": true,
    403           "justification": "The authors evaluate third-party LLMs (GPT-4o, Llama-3, CodeLlama, DeepSeek-Coder) rather than their own system. The poisoning methodology is the contribution, and they report results across all models without cherry-picking favorable configurations.",
    404           "source": "opus"
    405         },
    406         "compute_budget_vs_performance": {
    407           "applies": true,
    408           "answer": false,
    409           "justification": "Models of vastly different sizes are compared (8B vs GPT-4o) without discussing compute differences. No performance-per-compute analysis is provided.",
    410           "source": "opus"
    411         },
    412         "benchmark_construct_validity": {
    413           "applies": true,
    414           "answer": false,
    415           "justification": "The VR metric relies on an LLM judge with 77–81% accuracy. The paper validates the judge (§6.2) but does not discuss whether detecting vulnerability patterns via LLM actually measures real-world security risk—e.g., whether flagged vulnerabilities are exploitable.",
    416           "source": "opus"
    417         },
    418         "scaffold_confound_addressed": {
    419           "applies": false,
    420           "answer": false,
    421           "justification": "No agentic scaffolding is used. The RAG pipeline is simple retrieval + generation without scaffolding.",
    422           "source": "opus"
    423         }
    424       },
    425       "data_leakage": {
    426         "temporal_leakage_addressed": {
    427           "applies": true,
    428           "answer": false,
    429           "justification": "No discussion of temporal leakage. ReposVul contains vulnerabilities from public repositories that predate the LLMs' training cutoffs, meaning models may have already seen these code patterns during training.",
    430           "source": "opus"
    431         },
    432         "feature_leakage_addressed": {
    433           "applies": true,
    434           "answer": false,
    435           "justification": "No discussion of whether providing vulnerable code as in-context examples leaks information differently than real-world RAG settings (e.g., whether format/structure of injected examples is distinguishable).",
    436           "source": "opus"
    437         },
    438         "non_independence_addressed": {
    439           "applies": true,
    440           "answer": false,
    441           "justification": "No discussion of whether training data and test examples share structural similarities (same repositories, same authors, duplicate patterns) that could inflate results.",
    442           "source": "opus"
    443         },
    444         "leakage_detection_method": {
    445           "applies": true,
    446           "answer": false,
    447           "justification": "No leakage detection or prevention methods (canary strings, membership inference, decontamination) are applied.",
    448           "source": "opus"
    449         }
    450       }
    451     }
    452   },
    453   "claims": [
    454     {
    455       "claim": "A single poisoned sample in the knowledge base can cause ~48% of CodeLlama-generated code to be vulnerable when using the JINA retriever (Scenario I).",
    456       "evidence": "Table 4: CodeLlama/JINA VR rises from 0.29 (baseline) to 0.48 at poisoning=1, representing a 19pp increase from 0.008% knowledge base contamination.",
    457       "supported": "strong"
    458     },
    459     {
    460       "claim": "Dense retrievers (JINA) amplify poisoning risk far more than sparse retrievers (BM25) due to superior retrieval of semantically relevant—and thus targeted—vulnerable examples.",
    461       "evidence": "Table 4: JINA VRRC=0.41 vs BM25 VRRC=0.06 at poisoning=5; validated by retriever effectiveness metrics in Table 11 (JINA MRR=0.85 vs BM25 MRR=0.20).",
    462       "supported": "strong"
    463     },
    464     {
    465       "claim": "Providing more few-shot examples in RACG increases vulnerability risk by approximately 6.5% (one-shot to three-shot with JINA retriever in Scenario I).",
    466       "evidence": "Table 6: Aggregated VR for JINA-I increases from 0.46 to 0.49 (3pp absolute), with VRRC rising from 0.41 to 0.44; the 6.5% figure is the relative VR change (0.03/0.46) for JINA-I.",
    467       "supported": "moderate"
    468     },
    469     {
    470       "claim": "Code-specialized LLMs (CodeLlama) are more susceptible to vulnerability propagation than general-purpose LLMs of similar scale.",
    471       "evidence": "Throughout Tables 4–6, CodeLlama shows the highest VR across all configurations; however, CodeLlama (13B) is larger than Llama-3 (8B), confounding the specialization vs. size explanation.",
    472       "supported": "moderate"
    473     },
    474     {
    475       "claim": "Example-query similarity above 60% sharply increases vulnerability risk, while similarity below 60% has minimal impact.",
    476       "evidence": "Table 8: In Scenario I, aggregated VR increases from 0.35 ([40,60)) to 0.42 ([60,80)) to 0.53 ([80,100]), with VRRC showing the same threshold effect.",
    477       "supported": "strong"
    478     },
    479     {
    480       "claim": "CWE-352 (Cross-Site Request Forgery) consistently exhibits the highest vulnerability propagation rate (~0.79 in Scenario I, ~0.78 in Scenario II) among MITRE Top-10 weaknesses.",
    481       "evidence": "Table 9 confirms CWE-352 averages across all LLMs are 0.79 and 0.78 in Scenarios I and II respectively, substantially above other CWEs.",
    482       "supported": "strong"
    483     },
    484     {
    485       "claim": "The LLM-as-judge achieves 77–81% accuracy for vulnerability detection, making it an adequate substitute for manual review.",
    486       "evidence": "Table 10 reports accuracy of 0.76–0.84 (manual) and 0.79–0.82 (automated) across four languages; the paper claims this is 'commendable' without discussing how measurement error propagates to VR estimates.",
    487       "supported": "weak"
    488     }
    489   ],
    490   "methodology_tags": [
    491     "benchmark-eval",
    492     "case-study"
    493   ],
    494   "key_findings": "Knowledge base poisoning in RACG systems poses severe security risks: a single malicious code sample (0.008% of a 12,053-item knowledge base) can cause 48% of CodeLlama-generated code to contain vulnerabilities when using a dense semantic retriever. The attack is substantially more difficult without knowing developer queries (Scenario II), requiring 100% knowledge base poisoning to match Scenario I's effectiveness at 0.075% contamination. Dense retrievers create a security paradox—higher retrieval quality increases both code generation quality and vulnerability propagation rate. Vulnerability risk scales with example-query similarity (sharp threshold at 60%) and with the number of few-shot examples provided. CWE-352 (CSRF) is the most contagious vulnerability type, while Java and Python show more natural resistance than C and C++ due to built-in safety abstractions.",
    495   "red_flags": [
    496     {
    497       "flag": "No statistical significance testing",
    498       "detail": "All comparative claims (JINA vs BM25, one-shot vs three-shot, language differences, LLM differences) are presented as point estimates without any significance tests or confidence intervals, making it impossible to distinguish real effects from noise."
    499     },
    500     {
    501       "flag": "LLM judge measurement error unaccounted for",
    502       "detail": "The LLM judge has 77–81% accuracy, meaning roughly 19–23% of vulnerability judgments are wrong. Main findings (e.g., '48% VR') are presented without correcting for or propagating this error, potentially inflating or deflating reported rates."
    503     },
    504     {
    505       "flag": "GPT-4o version not pinned",
    506       "detail": "GPT-4o is referenced without a snapshot date or API version, making results for this model non-reproducible as the model is continuously updated."
    507     },
    508     {
    509       "flag": "Training data contamination ignored",
    510       "detail": "ReposVul draws from public GitHub repositories with real CVEs that are almost certainly in LLM training data; the possibility that LLMs have memorized these vulnerability patterns is never discussed."
    511     },
    512     {
    513       "flag": "Code LLM vs general LLM confound",
    514       "detail": "CodeLlama (13B) is compared to Llama-3 (8B); the claimed explanation that code specialization causes higher VR is confounded by model size—larger models generally follow examples more closely."
    515     },
    516     {
    517       "flag": "No code or data released",
    518       "detail": "The paper is entirely non-reproducible: no experimental code, no processed dataset, no generated outputs, and no vulnerability judge implementation are released despite the appendix providing prompts."
    519     }
    520   ],
    521   "cited_papers": [
    522     {
    523       "title": "PoisonedRAG: Knowledge Poisoning Attacks to Retrieval-Augmented Generation of Large Language Models",
    524       "relevance": "Closest prior work; this paper distinguishes RACG security poisoning from RAG functional poisoning studied in PoisonedRAG"
    525     },
    526     {
    527       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    528       "relevance": "Seminal study on LLM-generated code security without RAG; provides baseline for understanding inherent LLM security limitations"
    529     },
    530     {
    531       "title": "How Secure is Code Generated by ChatGPT?",
    532       "relevance": "Comparative study on LLM code security; this paper extends those findings to the RACG threat model"
    533     },
    534     {
    535       "title": "How Secure is AI-Generated Code: A Large-Scale Comparison of Large Language Models",
    536       "relevance": "Large-scale empirical benchmark of LLM code security; establishes baseline vulnerability rates for comparison"
    537     },
    538     {
    539       "title": "ReposVul: A Repository-Level High-Quality Vulnerability Dataset",
    540       "relevance": "Primary dataset used in this study; understanding its construction is essential to evaluating experimental validity"
    541     },
    542     {
    543       "title": "Code Llama: Open Foundation Models for Code",
    544       "relevance": "One of the four LLMs evaluated; code-specialized model showing highest vulnerability susceptibility"
    545     },
    546     {
    547       "title": "Retrieval Augmented Code Generation and Summarization",
    548       "relevance": "Foundational RACG paper establishing the paradigm whose security this study examines"
    549     },
    550     {
    551       "title": "CodeRAG-Bench: Can Retrieval Augment Code Generation?",
    552       "relevance": "Benchmarks RACG effectiveness; this paper adds security as a missing evaluation dimension"
    553     },
    554     {
    555       "title": "Poisoning Web-Scale Training Datasets is Practical",
    556       "relevance": "Demonstrates feasibility of the threat model's assumption that attackers can poison public code repositories"
    557     }
    558   ],
    559   "engagement_factors": {
    560     "practical_relevance": {
    561       "score": 2,
    562       "justification": "Provides actionable insights for anyone building RACG systems (e.g., prefer second-most-similar retrieval, monitor for specific CWEs), but no tool or defense is released."
    563     },
    564     "surprise_contrarian": {
    565       "score": 1,
    566       "justification": "The general concept that poisoned inputs lead to poisoned outputs is intuitive; the specific magnitudes (48% from a single sample) are notable but not paradigm-shifting."
    567     },
    568     "fear_safety": {
    569       "score": 3,
    570       "justification": "Demonstrates a concrete, novel attack vector on RACG code generation systems where a single malicious code example can compromise nearly half of generated code."
    571     },
    572     "drama_conflict": {
    573       "score": 1,
    574       "justification": "Raises concerns about RAG security but does not directly call out any product or company, and findings are presented in a measured academic tone."
    575     },
    576     "demo_ability": {
    577       "score": 0,
    578       "justification": "No code, demo, or tools are released. The attack and evaluation pipeline are described but not made available."
    579     },
    580     "brand_recognition": {
    581       "score": 1,
    582       "justification": "Evaluates GPT-4o (OpenAI) and other known models, but authors are from NUDT and the paper itself is not from a high-profile lab."
    583     }
    584   },
    585   "hn_data": {
    586     "threads": [],
    587     "top_points": 0,
    588     "total_points": 0,
    589     "total_comments": 0
    590   }
    591 }

Impressum · Datenschutz