scan-v5.json (24375B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "EmbedGuard: Cross-Layer Detection and Provenance Attestation for Adversarial Embedding Attacks in RAG Systems", 6 "authors": [ 7 "Neeraj Kumar Singh Beshane" 8 ], 9 "year": 2026, 10 "venue": "International Journal of Computational and Experimental Science and ENgineering (IJCESEN)", 11 "arxiv_id": null, 12 "doi": "10.22399/ijcesen.4869" 13 }, 14 "checklist": { 15 "claims_and_evidence": { 16 "abstract_claims_supported": { 17 "applies": true, 18 "answer": false, 19 "justification": "The abstract claims specific detection rates (94.7%, 89.3%) and improvement over named baselines (RAGuard, RobustRAG, TrustRAG), but the references for these baselines are completely wrong papers — [6] is an LLM long-tail knowledge paper, [7] is a backdoor defense paper, [5] is an adversarial attacks paper — making the comparative claims unverifiable.", 20 "source": "haiku" 21 }, 22 "causal_claims_justified": { 23 "applies": true, 24 "answer": false, 25 "justification": "The paper claims ablation studies justify the 18.4pp improvement from cross-layer correlation, but no confidence intervals, statistical tests, or variance across runs are reported; a single-point result without uncertainty bounds is insufficient for causal inference.", 26 "source": "haiku" 27 }, 28 "generalization_bounded": { 29 "applies": true, 30 "answer": false, 31 "justification": "The paper generalizes findings to healthcare, finance, legal, and equity-for-small-organizations contexts based on a single undescribed evaluation corpus; the corpus provenance is never specified and no domain-specific evaluations are conducted.", 32 "source": "haiku" 33 }, 34 "alternative_explanations_discussed": { 35 "applies": true, 36 "answer": false, 37 "justification": "No alternative explanations are considered for the performance results; the paper presents only one interpretation of every finding without discussing confounds such as attack strength calibration or evaluation data selection.", 38 "source": "haiku" 39 }, 40 "proxy_outcome_distinction": { 41 "applies": true, 42 "answer": true, 43 "justification": "The paper measures detection rate, false positive rate, and latency, which directly correspond to the security claims made; there is no conflation of proxy metrics with higher-order outcomes.", 44 "source": "haiku" 45 } 46 }, 47 "limitations_and_scope": { 48 "limitations_section_present": { 49 "applies": true, 50 "answer": false, 51 "justification": "There is no dedicated limitations or threats-to-validity section; the paper proceeds from experimental evaluation (Section 4) directly to applications (Section 5) and conclusions (Section 6).", 52 "source": "haiku" 53 }, 54 "threats_to_validity_specific": { 55 "applies": true, 56 "answer": false, 57 "justification": "No threats to validity are discussed anywhere in the paper; the discussion of limitations is entirely absent.", 58 "source": "haiku" 59 }, 60 "scope_boundaries_stated": { 61 "applies": true, 62 "answer": false, 63 "justification": "The paper does not state what the results do NOT show; instead it expands scope in Section 5 to broad societal implications without any bounding of claims.", 64 "source": "haiku" 65 } 66 }, 67 "conflicts_of_interest": { 68 "funding_disclosed": { 69 "applies": true, 70 "answer": true, 71 "justification": "The author statements explicitly declare 'there is no funding to be acknowledged.'", 72 "source": "haiku" 73 }, 74 "affiliations_disclosed": { 75 "applies": true, 76 "answer": true, 77 "justification": "The single author is listed as 'Independent Researcher, California, USA' with no institutional affiliation.", 78 "source": "haiku" 79 }, 80 "funder_independent_of_outcome": { 81 "applies": false, 82 "answer": false, 83 "justification": "No funding is declared, so funder independence is not applicable.", 84 "source": "haiku" 85 }, 86 "financial_interests_declared": { 87 "applies": true, 88 "answer": true, 89 "justification": "The author statements declare 'no known competing financial interests or personal relationships that could have appeared to influence the work.'", 90 "source": "haiku" 91 } 92 }, 93 "scope_and_framing": { 94 "key_terms_defined": { 95 "applies": true, 96 "answer": true, 97 "justification": "Key terms including 'embedding space poisoning,' 'cross-layer detection,' 'TEE,' and 'cryptographic attestation' are explained in context; RAG system architecture is introduced clearly.", 98 "source": "haiku" 99 }, 100 "intended_contribution_clear": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 1.3 lists five numbered contributions explicitly, covering architecture, cryptographic attestation, production-scale evaluation, ablation, and deployment modes.", 104 "source": "haiku" 105 }, 106 "engagement_with_prior_work": { 107 "applies": true, 108 "answer": false, 109 "justification": "Section 2 discusses RAGuard, RobustRAG, and TrustRAG as prior defenses, but the citations ([5], [6], [7]) point to completely unrelated papers; reference [10] claims to support 'query-efficient adversarial testing' but actually cites a zero-knowledge proofs cryptography paper — the prior work engagement is fundamentally misrepresented.", 110 "source": "haiku" 111 } 112 } 113 }, 114 "type_checklist": { 115 "empirical": { 116 "artifacts": { 117 "code_released": { 118 "applies": true, 119 "answer": false, 120 "justification": "No code is released; the author statements say data is 'available on request from the corresponding author' and 'not publicly available due to privacy or ethical restrictions.'", 121 "source": "haiku" 122 }, 123 "data_released": { 124 "applies": true, 125 "answer": false, 126 "justification": "The 500,000-embedding corpus and 47,000-query workload are described as available on request only, not publicly released.", 127 "source": "haiku" 128 }, 129 "environment_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "Hardware specs (AMD EPYC 7542, AMD SEV-SNP) and the embedding model (all-mpnet-base-v2) are named, but no software dependencies, Python version, requirements file, or Docker configuration is provided.", 133 "source": "haiku" 134 }, 135 "reproduction_instructions": { 136 "applies": true, 137 "answer": false, 138 "justification": "No step-by-step reproduction instructions are provided; the paper describes the architecture but gives no runnable procedure for replicating any result.", 139 "source": "haiku" 140 } 141 }, 142 "statistical_methodology": { 143 "confidence_intervals_or_error_bars": { 144 "applies": true, 145 "answer": false, 146 "justification": "No confidence intervals or error bars are reported for any result in Tables 3-5; all numbers are single point estimates.", 147 "source": "haiku" 148 }, 149 "significance_tests": { 150 "applies": true, 151 "answer": false, 152 "justification": "No statistical significance tests are used for any comparative claim between EmbedGuard and the three baseline systems.", 153 "source": "haiku" 154 }, 155 "effect_sizes_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Percentage point improvements are explicitly reported: 18.4pp improvement from cross-layer correlation over best single-layer, 27.9-35.1pp advantage over baselines under adaptive attacks.", 159 "source": "haiku" 160 }, 161 "sample_size_justified": { 162 "applies": true, 163 "answer": false, 164 "justification": "The 500,000 embeddings and 47,000 queries are presented as given without justification for why these sizes were chosen or any power analysis.", 165 "source": "haiku" 166 }, 167 "variance_reported": { 168 "applies": true, 169 "answer": false, 170 "justification": "No standard deviation or variance across runs is reported; all metrics appear as single-run point estimates.", 171 "source": "haiku" 172 } 173 }, 174 "evaluation_design": { 175 "baselines_included": { 176 "applies": true, 177 "answer": true, 178 "justification": "Three baselines are included (RAGuard, RobustRAG, TrustRAG) in Table 5, though the citations for these systems point to completely unrelated papers.", 179 "source": "haiku" 180 }, 181 "baselines_contemporary": { 182 "applies": true, 183 "answer": false, 184 "justification": "The baseline systems cannot be verified as contemporary or even correct — the citation for 'RAGuard' ([6]) is actually 'Large Language Models Struggle to Learn Long-Tail Knowledge' (2023), and 'RobustRAG' ([5]) is an adversarial attacks paper; the actual baseline implementations cannot be confirmed.", 185 "source": "haiku" 186 }, 187 "ablation_study": { 188 "applies": true, 189 "answer": true, 190 "justification": "Table 3 (ablation study) presents detection rates for all combinations of layer removal, showing the contribution of each layer.", 191 "source": "haiku" 192 }, 193 "multiple_metrics": { 194 "applies": true, 195 "answer": true, 196 "justification": "Detection rate, false positive rate, mean latency, and P99 latency are all reported per attack category.", 197 "source": "haiku" 198 }, 199 "human_evaluation": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human evaluation is relevant for this automated attack detection system.", 203 "source": "haiku" 204 }, 205 "held_out_test_set": { 206 "applies": true, 207 "answer": false, 208 "justification": "The correlation engine weights are calibrated on 5,000 'held-out attack samples,' but the relationship between this calibration set and the evaluation attack categories is not clearly delineated; the origin and independence of all attack data is undescribed.", 209 "source": "haiku" 210 }, 211 "per_category_breakdown": { 212 "applies": true, 213 "answer": true, 214 "justification": "Table 3 provides per-attack-category breakdowns for detection rate, false positive rate, and latency across five distinct attack types.", 215 "source": "haiku" 216 }, 217 "failure_cases_discussed": { 218 "applies": true, 219 "answer": false, 220 "justification": "No failure cases are shown or discussed; the paper does not analyze the 5.3-10.7% of attacks that evade detection.", 221 "source": "haiku" 222 }, 223 "negative_results_reported": { 224 "applies": true, 225 "answer": false, 226 "justification": "No negative results are reported; all experiments show EmbedGuard outperforming all baselines on all metrics.", 227 "source": "haiku" 228 } 229 }, 230 "setup_transparency": { 231 "model_versions_specified": { 232 "applies": true, 233 "answer": true, 234 "justification": "The embedding model 'all-mpnet-base-v2 (768 dimensions)' is specified by name, and the DistilBERT classifier training setup (156,000 query pairs) is described, though no HuggingFace commit hash or snapshot date is given.", 235 "source": "haiku" 236 }, 237 "prompts_provided": { 238 "applies": false, 239 "answer": false, 240 "justification": "EmbedGuard is a detection framework rather than a prompt-based generative system; no LLM prompts are central to the evaluation design.", 241 "source": "haiku" 242 }, 243 "hyperparameters_reported": { 244 "applies": true, 245 "answer": true, 246 "justification": "Key hyperparameters are reported: ProjGrad learning rate 0.01, 500 iterations, k=50 PCA components, K=5 perturbation sets, ±0.05 cosine distance threshold, correlation weights β₁-β₄.", 247 "source": "haiku" 248 }, 249 "scaffolding_described": { 250 "applies": true, 251 "answer": true, 252 "justification": "The four-layer architecture and correlation engine are described in detail in Sections 3.2-3.6 including mathematical formulations for each detection mechanism.", 253 "source": "haiku" 254 }, 255 "data_preprocessing_documented": { 256 "applies": true, 257 "answer": false, 258 "justification": "The corpus is described only as 'spanning technical documentation, medical literature, legal texts, and encyclopedic knowledge' with no description of how documents were obtained, filtered, or preprocessed.", 259 "source": "haiku" 260 } 261 }, 262 "data_integrity": { 263 "raw_data_available": { 264 "applies": true, 265 "answer": false, 266 "justification": "Data is explicitly stated to be 'not publicly available due to privacy or ethical restrictions' and available only on request.", 267 "source": "haiku" 268 }, 269 "data_collection_described": { 270 "applies": true, 271 "answer": false, 272 "justification": "The corpus composition is mentioned at a high level (500,000 embeddings across domains) but how documents were collected, what sources were used, and how the 47,000 evaluation queries were generated are not described.", 273 "source": "haiku" 274 }, 275 "recruitment_methods_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants were involved; the evaluation uses document corpora and synthetic attack generation.", 279 "source": "haiku" 280 }, 281 "data_pipeline_documented": { 282 "applies": true, 283 "answer": false, 284 "justification": "The data pipeline from document collection through embedding generation to attack simulation is not documented; the origin of the 156,000 adversarial-benign training pairs for the DistilBERT classifier is not disclosed.", 285 "source": "haiku" 286 } 287 }, 288 "contamination": { 289 "training_cutoff_stated": { 290 "applies": true, 291 "answer": false, 292 "justification": "The all-mpnet-base-v2 embedding model's training data cutoff is not stated; the DistilBERT classifier's training data provenance and temporal relationship to test data are not described.", 293 "source": "haiku" 294 }, 295 "train_test_overlap_discussed": { 296 "applies": true, 297 "answer": false, 298 "justification": "No discussion of whether the DistilBERT classifier's 156,000 training pairs overlap with the evaluation attack datasets is present.", 299 "source": "haiku" 300 }, 301 "benchmark_contamination_addressed": { 302 "applies": true, 303 "answer": false, 304 "justification": "The evaluation attacks are synthetic but derived from methods in prior literature; whether those attack patterns were available during the classifier's training is not addressed.", 305 "source": "haiku" 306 } 307 }, 308 "human_studies": { 309 "pre_registered": { 310 "applies": false, 311 "answer": false, 312 "justification": "No human participants involved.", 313 "source": "haiku" 314 }, 315 "irb_or_ethics_approval": { 316 "applies": false, 317 "answer": false, 318 "justification": "No human participants; author statements confirm 'research is not related to either human or animal use.'", 319 "source": "haiku" 320 }, 321 "demographics_reported": { 322 "applies": false, 323 "answer": false, 324 "justification": "No human participants involved.", 325 "source": "haiku" 326 }, 327 "inclusion_exclusion_criteria": { 328 "applies": false, 329 "answer": false, 330 "justification": "No human participants involved.", 331 "source": "haiku" 332 }, 333 "randomization_described": { 334 "applies": false, 335 "answer": false, 336 "justification": "No human participants involved.", 337 "source": "haiku" 338 }, 339 "blinding_described": { 340 "applies": false, 341 "answer": false, 342 "justification": "No human participants involved.", 343 "source": "haiku" 344 }, 345 "attrition_reported": { 346 "applies": false, 347 "answer": false, 348 "justification": "No human participants involved.", 349 "source": "haiku" 350 } 351 }, 352 "cost_and_practicality": { 353 "inference_cost_reported": { 354 "applies": true, 355 "answer": true, 356 "justification": "Mean latency (51ms) and P99 latency are reported per attack category, and per-layer latency breakdown is provided in Table 4; TEE attestation overhead (12.8ms per document ingestion) is also quantified.", 357 "source": "haiku" 358 }, 359 "compute_budget_stated": { 360 "applies": true, 361 "answer": false, 362 "justification": "Hardware specs are listed (AMD EPYC 7542, 256GB RAM) but total GPU/CPU hours or cost for running the evaluation are not stated.", 363 "source": "haiku" 364 } 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "EmbedGuard achieves 94.7% detection rate for optimization-based attacks with 3.2% false positive rate", 371 "evidence": "Table 3 reports 94.7% detection, 3.2% FP rate, 47ms mean latency for 12,500 optimization-based attacks", 372 "supported": "weak" 373 }, 374 { 375 "claim": "Cross-layer correlation provides 18.4 percentage point improvement over the best single-layer approach", 376 "evidence": "Ablation Table 3 shows full system at 94.7% vs embedding-only at 76.3%; difference is 18.4pp", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "EmbedGuard maintains 89.3% detection under adaptive attacks vs 54.2-61.4% for single-layer defenses", 381 "evidence": "Table 5 comparative results; however the cited baseline papers (RAGuard [6], RobustRAG [5], TrustRAG [7]) reference completely unrelated publications", 382 "supported": "unsupported" 383 }, 384 { 385 "claim": "TEE attestation provides deterministic 100% true positive rate for direct embedding injection attacks", 386 "evidence": "Claimed in Section 3.3 as a property of cryptographic verification; no empirical test of this specific scenario is reported", 387 "supported": "weak" 388 }, 389 { 390 "claim": "The framework operates with 51ms mean latency overhead, within acceptable sub-100ms bounds for production", 391 "evidence": "Table 4 per-layer latency breakdown sums to 51ms; P99 latency is 142-171ms depending on attack type", 392 "supported": "moderate" 393 } 394 ], 395 "methodology_tags": [ 396 "benchmark-eval", 397 "case-study" 398 ], 399 "key_findings": "EmbedGuard proposes a four-layer cross-layer detection framework combining TEE-based cryptographic embedding attestation with statistical anomaly detection (PCA, KL divergence, output perturbation stability) for RAG system security. Evaluated on a claimed production-scale corpus (500K embeddings, 47K queries), the framework reportedly achieves 94.7% detection of optimization-based attacks and maintains 89.3% detection against adaptive attacks, with a 51ms mean latency overhead. However, the paper contains systematic citation fabrication: the three named baseline systems (RAGuard, RobustRAG, TrustRAG) are cited against completely unrelated papers, no code or data is publicly released, and no statistical uncertainty is reported on any result, making independent verification of all claims impossible.", 400 "red_flags": [ 401 { 402 "flag": "Systematic citation fabrication for baselines", 403 "detail": "RAGuard is cited as [6] but [6] is 'LLMs Struggle to Learn Long-Tail Knowledge' (Kandpal et al.); RobustRAG is cited as [5] but [5] is a paper on adversarial attacks on aligned LLMs; TrustRAG is cited as [7] but [7] is a backdoor defense paper; reference [10] is cited to support 'query-efficient adversarial testing via Bayesian optimization' but actually cites a zero-knowledge proofs cryptography paper. The comparative evaluation cannot be verified." 404 }, 405 { 406 "flag": "No code or data released", 407 "detail": "Source code is not released and the 500K-embedding corpus is 'not publicly available due to privacy or ethical restrictions,' making all quantitative claims unverifiable." 408 }, 409 { 410 "flag": "No statistical uncertainty on any result", 411 "detail": "Zero confidence intervals, error bars, or significance tests appear in the paper; all detection rates are single-point estimates with no variance reported across runs." 412 }, 413 { 414 "flag": "Single independent researcher, low-tier venue", 415 "detail": "Systems-security work of claimed production scale (500K embeddings, TEE integration) published by a single independent researcher with no institutional affiliation in IJCESEN, a low-quality journal, without any code or data release." 416 }, 417 { 418 "flag": "No limitations section", 419 "detail": "The paper contains no limitations, threats-to-validity, or scope-bounding section; Section 5 instead expands scope to speculative societal implications." 420 }, 421 { 422 "flag": "Undisclosed training data for DistilBERT classifier", 423 "detail": "The prompt-layer classifier is trained on '156,000 adversarial-benign query pairs from recent prompt injection datasets' but the source of these pairs is not cited or described, and potential overlap with the test attack datasets is not discussed." 424 } 425 ], 426 "cited_papers": [ 427 { 428 "title": "PoisonedRAG: Knowledge Poisoning Attacks to Retrieval-Augmented Generation of Large Language Models", 429 "relevance": "Foundational attack paper motivating the threat model; demonstrates <1% corpus contamination achieving >80% attack success" 430 }, 431 { 432 "title": "Prompt Injection Attack against LLM-integrated Applications", 433 "relevance": "Characterizes prompt injection threats that Layer 1 of EmbedGuard targets" 434 }, 435 { 436 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", 437 "relevance": "Adversarial suffix attacks informing the transferability attack category in evaluation" 438 }, 439 { 440 "title": "Defending against Backdoor Attacks in Natural Language Generation", 441 "relevance": "Backdoor defense techniques related to the output consistency verification layer" 442 }, 443 { 444 "title": "Query-Efficient Black-Box Red Teaming via Bayesian Optimization", 445 "relevance": "Adaptive attack methodology using Bayesian optimization to probe deployed defenses" 446 } 447 ], 448 "engagement_factors": { 449 "practical_relevance": { 450 "score": 2, 451 "justification": "RAG system security is a genuine production concern, but no code is released and baselines cannot be verified, limiting practitioner utility." 452 }, 453 "surprise_contrarian": { 454 "score": 1, 455 "justification": "Cross-layer defense is a sensible architectural insight but not a surprising finding; the hardware attestation angle is novel framing." 456 }, 457 "fear_safety": { 458 "score": 2, 459 "justification": "Embedding poisoning attacks on production RAG systems with <1% contamination achieving >80% success rate is a genuine and underappreciated threat." 460 }, 461 "drama_conflict": { 462 "score": 1, 463 "justification": "Standard attacker-defender framing with adaptive attack evaluation; no controversy angle beyond the security domain." 464 }, 465 "demo_ability": { 466 "score": 0, 467 "justification": "No code, no demo, no public data; nothing to try." 468 }, 469 "brand_recognition": { 470 "score": 0, 471 "justification": "Independent researcher, unknown journal; no affiliation with a recognized lab or product." 472 } 473 }, 474 "hn_data": { 475 "threads": [], 476 "top_points": 0, 477 "total_points": 0, 478 "total_comments": 0 479 } 480 }