ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25585B)


      1 {
      2   "paper": {
      3     "title": "CachePrune: Neural-Based Attribution Defense Against Indirect Prompt Injection Attacks",
      4     "authors": [
      5       "Rui Wang",
      6       "Junda Wu",
      7       "Yu Xia",
      8       "Tong Yu",
      9       "Ruiyi Zhang",
     10       "Ryan Rossi",
     11       "Subrata Mitra",
     12       "Lina Yao",
     13       "Julian McAuley"
     14     ],
     15     "year": 2025,
     16     "venue": "arXiv",
     17     "arxiv_id": "2504.21228"
     18   },
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No repository URL, GitHub link, or code archive is provided in the paper. No mention of code release or availability."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper uses publicly available datasets: SQuAD (Rajpurkar, 2016), HotpotQA (Yang et al., 2018), and WildChat (Zhao et al., 2024), with splits from Abdelnabi et al. (2024). These are all publicly accessible benchmarks."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment setup section listing library versions is provided."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions, README, or scripts for replicating experiments are provided in the paper."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "CachePrune results are reported with ± notation across 3 trials (e.g., '7.44 ± 0.22' for ASR on SQuAD with LLama3-8B in Table 1). However, baselines do not include error bars."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No statistical significance tests (t-tests, Mann-Whitney, etc.) are used to validate claims that CachePrune outperforms baselines. Comparisons are based solely on point estimates."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports percentage improvements with baseline context. For example, ASR drops from 27.86% (Vanilla) to 7.44% for CachePrune on SQuAD with LLama3-8B (Table 1), giving clear magnitude context for the improvements."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper uses N=8 samples for neural attribution and tests on 400 prompts per dataset, but provides no justification for why these sample sizes are adequate. No power analysis is discussed."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Results for CachePrune are reported as averages with standard deviation across 3 trials (e.g., '7.44 ± 0.22' in Table 1). Table 6 also shows variance across different N values."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper compares against multiple baselines: Vanilla, Delimiting, Datamarking, Sandwich, and Encode_Base64, as described in Appendix A and Table 1."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Baselines come from recent work (Wu et al., 2023; Hines et al., 2024; Schulhoff et al., 2023). The paper explicitly justifies not comparing to training-based or test-time-workflow approaches due to computational cost differences (Appendix A)."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Table 5 ablates the selective thresholding subset Phi (w/ vs w/o Phi). Table 3 ablates the number of trigger tokens k. Table 4 ablates the masking parameter alpha. Table 6 ablates the number of attribution samples N."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper reports ASR, F1 (clean), F1 (attack), and GPT-Score for WildChat. Appendix H additionally reports ROUGE-1/2/L and BERTScore (Tables 13, 14)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation of defense quality or response quality is performed. All evaluation is automated via F1, ROUGE, BERTScore, and LLM-as-judge (GPT-Score)."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper states 'we randomly select 8 samples from a pool of 400 prompts that are not overlapped with the testing data' (Section 4.1), indicating separation between attribution samples and test data."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down per model (LLama3-8B, Mistral-7B, Phi-3.5-mini), per dataset (SQuAD, HotpotQA, WildChat), and per attack type (Table 2, Figure 5). Figure 6 shows per-layer neuron distribution."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper discusses cases where the adaptive attack is effective (LLama3-8B near 100% ASR on baselines in Table 7), and notes HotpotQA is harder than SQuAD. The Limitations section discusses the lack of comparison with training-based approaches."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Table 3 shows that using more tokens (k=4, Lattr_full) for attribution actually degrades performance compared to k=1. Table 5 shows that pruning without Phi at p=5% causes severe quality degradation (F1 drops to 6.48). Delimiting and Sandwich sometimes increase ASR over Vanilla (Table 1, HotpotQA)."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims CachePrune 'significantly reduce attack success rates while not compromising the response quality.' Table 1 supports this across three models and three datasets, showing ASR reductions with maintained F1/GPT-Score."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper makes causal claims that pruning task-triggering neurons causes reduction in ASR. The ablation studies (Tables 3-6) provide controlled single-variable manipulations of k, alpha, N, and Phi, adequately supporting these causal claims through systematic intervention design."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title claims defense against 'Indirect Prompt Injection Attacks' generally, but experiments are limited to three relatively small open-source models (3.8B-8B parameters) on question-answering and dialogue summarization tasks. No commercial models or diverse task types are tested. The paper does not bound these generalization limits."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper does not discuss alternative explanations for why CachePrune works. For example, it does not consider whether the ASR reduction could be partly due to general response degradation rather than specifically addressing instruction-following neurons, or whether the improvements are an artifact of the specific attack injection patterns used."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper specifies 'LLama3-8B', 'Mistral-7B-Instruct-V3.0', and 'Phi-3.5-mini-instruct (3.8B)' but does not provide exact model snapshot dates or version hashes. 'Mistral-7B-Instruct-V3.0' is the most specific, but LLama3-8B and Phi-3.5-mini-instruct lack precise version identifiers."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Appendix I provides the full prompt templates used for the GPT-4 judge evaluation, including the dialogue summarization quality prompt and the attack detection prompt. The attack injection formats are also described in Appendix F."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Key hyperparameters are reported: N=8 attribution samples, p=0.5% pruning ratio, k=1 trigger tokens, alpha=1.0 masking parameter (Section 4.1). The GPT judge uses 'gpt-4-1106-preview' (Appendix I)."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "CachePrune is not an agentic scaffolding system. It is a single-pass neural attribution and pruning mechanism applied to the KV cache, with no multi-step agent workflows."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The paper describes using splits from Abdelnabi et al. (2024), the injection method (random injection into beginning, middle, and end of context), the WildChat modification ('You should primarily focus on this question'), and the sampling procedure (8 from 400 non-overlapping prompts)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6 is titled 'Limitations' and discusses the lack of comparison with training-based defense approaches and the computational focus of the work."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The Limitations section is brief (one paragraph) and only mentions not exploring training-based defenses. It does not discuss specific threats such as the small number of models tested, the specific attack patterns used, or potential overfitting of the attribution mask to the few attribution samples."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to the tested models, datasets, or attack types. The title and framing suggest general applicability without acknowledging the limited scope of evaluation."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (model outputs, attribution masks, per-example results) is available for independent verification. Only aggregate results are reported in tables."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The paper describes using existing public datasets (SQuAD, HotpotQA, WildChat) with splits from Abdelnabi et al. (2024). The injection procedure and sample selection (8 from 400 non-overlapping prompts, 3 trials) are described in Section 4.1."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants are involved. The data sources are standard public benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from dataset selection to injection to evaluation is documented: datasets from Abdelnabi et al. (2024), random injection of attack instructions, sampling 8 from 400 non-overlapping prompts, evaluation with F1/ROUGE/BERTScore/GPT-Score. Evaluation code snippets are provided in Listings 1-3."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section is present in the paper. No mention of grants or funding sources."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: Adobe Research, UC San Diego, UNSW, and CSIRO's Data61. The paper header shows institutional affiliations for all authors."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Multiple authors are from Adobe Research, which has a commercial interest in LLM security. No funding disclosure is made, so independence cannot be assessed. The paper does not evaluate Adobe products specifically, but the affiliation is relevant."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper proposes a defense mechanism rather than evaluating model knowledge on benchmarks. The evaluation measures attack success rates and response quality, not model knowledge. Contamination of training data with benchmark questions is not relevant to the defense evaluation."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper tests a defense mechanism, not model capability on benchmarks. Train/test contamination of the LLMs with SQuAD/HotpotQA questions is not relevant to whether the defense reduces attack success rates."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Same as above: the paper evaluates a defense mechanism's effectiveness at reducing attack success rates, not model knowledge or capability on the benchmark tasks themselves."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants are involved in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The paper claims CachePrune is 'computationally lightweight' and does not require 'extra test-time computation per response or LLM calls' but does not report actual inference cost, latency, wall-clock time, or tokens consumed for the attribution step or pruning."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget (GPU hours, hardware used, training time for attribution) is reported. The paper mentions using only N=8 samples and a single forward pass but does not quantify the actual compute requirements."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "CachePrune significantly reduces attack success rates while not compromising response quality.",
    296       "evidence": "Table 1 shows ASR reduction from 27.86% to 7.44% on SQuAD with LLama3-8B, from 69.01% to 15.23% on HotpotQA, and from 14.50% to 2.00% on WildChat, while maintaining comparable F1 scores and GPT-Scores to Vanilla.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Neural attribution can be effectively performed with only N=8 samples.",
    301       "evidence": "Table 6 shows that N=4, 8, and 12 yield similar performance across all three models, with N=8 being the default. The ASR difference between N=4 and N=8 is small (e.g., 8.15 vs 7.44 for LLama3-8B).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Only the first k=1 token is sufficient for effective neural attribution (the triggering effect).",
    306       "evidence": "Table 3 and Figure 3 show that k=1 achieves the best ASR/F1 trade-off (7.44% ASR) compared to k=2 (5.57% lower ASR but worse F1) and k=4 or Lattr_full (higher ASR). Figure 3 shows clean/poisoned responses can be triggered by 1-2 tokens.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "The learnt pruning mask is transferable between different attack types.",
    311       "evidence": "Table 2 shows masks learned from code-based injection can defend text-based injection and vice versa. Figure 5 shows cross-attack transferability across 4 different attack instructions, with darkest colors not necessarily on the diagonal.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "CachePrune is robust to adaptive attacks (GCG-based).",
    316       "evidence": "Tables 7-12 show that CachePrune maintains low ASR under adaptive attacks (e.g., 7.71% on LLama3-8B vs near 100% for baselines). However, the adaptive attack is less effective on Mistral and Phi models even without CachePrune.",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "The selective thresholding subset Phi preserves response quality during pruning.",
    321       "evidence": "Table 5 shows that at p=5%, pruning without Phi drops F1 (clean) from 27.48 to 6.48, while pruning with Phi maintains F1 at 27.48. The effect is less pronounced at lower pruning ratios.",
    322       "supported": "strong"
    323     }
    324   ],
    325   "methodology_tags": [
    326     "benchmark-eval"
    327   ],
    328   "key_findings": "CachePrune defends against indirect prompt injection attacks by identifying and pruning task-triggering neurons from the KV cache using a preferential attribution loss related to DPO. The approach reduces attack success rates several-fold relative to prompt-engineering baselines across three models (LLama3-8B, Mistral-7B, Phi-3.5-mini) and three datasets (SQuAD, HotpotQA, WildChat), while preserving response quality. The paper demonstrates that effective attribution requires only 8 samples and 1 trigger token, and that learned pruning masks transfer across different attack types. The approach shows robustness to GCG-based adaptive attacks, especially on LLama3-8B where baselines fail completely.",
    329   "red_flags": [
    330     {
    331       "flag": "No code release",
    332       "detail": "Despite proposing a novel defense mechanism with specific implementation details, no source code is provided. This makes independent reproduction and verification difficult."
    333     },
    334     {
    335       "flag": "No significance testing",
    336       "detail": "Comparative claims between CachePrune and baselines are made without statistical significance tests. Baselines are reported as single-run point estimates while CachePrune reports means over 3 trials, making direct comparison asymmetric."
    337     },
    338     {
    339       "flag": "Limited model scale",
    340       "detail": "All experiments use relatively small open-source models (3.8B-8B parameters). No experiments on larger models or commercial APIs are conducted, yet the paper's framing implies general applicability."
    341     },
    342     {
    343       "flag": "No compute cost reported",
    344       "detail": "Despite claiming the approach is 'computationally lightweight', no actual compute costs, GPU hours, or wall-clock times are reported for the attribution step."
    345     },
    346     {
    347       "flag": "Corporate affiliation without disclosure",
    348       "detail": "Multiple authors are from Adobe Research. No funding disclosure, competing interests statement, or acknowledgments section is present."
    349     },
    350     {
    351       "flag": "Minimal limitations section",
    352       "detail": "The Limitations section (Section 6) is a single short paragraph that only mentions not exploring training-based approaches, without discussing specific threats to the validity of the results."
    353     }
    354   ],
    355   "cited_papers": [
    356     {
    357       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    358       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    359       "year": 2023,
    360       "relevance": "Foundational work on indirect prompt injection attacks against LLMs, directly relevant to LLM security research."
    361     },
    362     {
    363       "title": "StruQ: Defending against prompt injection with structured queries",
    364       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    365       "year": 2024,
    366       "arxiv_id": "2402.06363",
    367       "relevance": "Training-based defense against prompt injection using structured queries, relevant baseline for LLM security methods."
    368     },
    369     {
    370       "title": "Aligning LLMs to be robust against prompt injection",
    371       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "Chuan Guo"],
    372       "year": 2024,
    373       "arxiv_id": "2410.05451",
    374       "relevance": "Alignment-based approach to defending against prompt injection, relevant to LLM safety and alignment research."
    375     },
    376     {
    377       "title": "Defending against indirect prompt injection attacks with spotlighting",
    378       "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"],
    379       "year": 2024,
    380       "arxiv_id": "2403.14720",
    381       "relevance": "Prompt-engineering defense against indirect prompt injection, directly comparable baseline method."
    382     },
    383     {
    384       "title": "FATH: Authentication-based test-time defense against indirect prompt injection attacks",
    385       "authors": ["Jiongxiao Wang", "Fangzhou Wu", "Wendi Li", "Jinsheng Pan", "Edward Suh", "Z Morley Mao", "Muhao Chen", "Chaowei Xiao"],
    386       "year": 2024,
    387       "arxiv_id": "2410.21492",
    388       "relevance": "Test-time defense approach against prompt injection using authentication, relevant to the defense methodology landscape."
    389     },
    390     {
    391       "title": "The Task Shield: Enforcing task alignment to defend against indirect prompt injection in LLM agents",
    392       "authors": ["Feiran Jia", "Tong Wu", "Xin Qin", "Anna Squicciarini"],
    393       "year": 2024,
    394       "arxiv_id": "2412.16682",
    395       "relevance": "Defense mechanism for LLM agents against indirect prompt injection through task alignment enforcement."
    396     },
    397     {
    398       "title": "Jatmo: Prompt injection defense by task-specific finetuning",
    399       "authors": ["Julien Piet", "Maha Alrashed", "Chawin Sitawarin", "Sizhe Chen", "Zeming Wei", "Elizabeth Sun", "Basel Alomair", "David Wagner"],
    400       "year": 2024,
    401       "relevance": "Task-specific finetuning approach to prompt injection defense, relevant to training-based defense methods."
    402     },
    403     {
    404       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    405       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"],
    406       "year": 2023,
    407       "arxiv_id": "2312.14197",
    408       "relevance": "Benchmark for indirect prompt injection attacks and defenses, relevant to evaluation methodology in LLM security."
    409     },
    410     {
    411       "title": "Are you still on track!? Catching LLM task drift with activations",
    412       "authors": ["Sahar Abdelnabi", "Aideen Fay", "Giovanni Cherubin", "Ahmed Salem", "Mario Fritz", "Andrew Paverd"],
    413       "year": 2024,
    414       "arxiv_id": "2406.00799",
    415       "relevance": "Activation-based detection of prompt injection task drift, the source of datasets used in this paper's evaluation."
    416     },
    417     {
    418       "title": "Direct preference optimization: Your language model is secretly a reward model",
    419       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Christopher D Manning", "Stefano Ermon", "Chelsea Finn"],
    420       "year": 2024,
    421       "relevance": "Foundation for the preferential attribution loss used in CachePrune, relevant to LLM alignment techniques."
    422     },
    423     {
    424       "title": "Universal and transferable adversarial attacks on aligned language models",
    425       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J Zico Kolter", "Matt Fredrikson"],
    426       "year": 2023,
    427       "arxiv_id": "2307.15043",
    428       "relevance": "GCG attack used for adaptive attack evaluation in this paper, fundamental to LLM adversarial robustness research."
    429     },
    430     {
    431       "title": "Automatic and universal prompt injection attacks against large language models",
    432       "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"],
    433       "year": 2024,
    434       "arxiv_id": "2403.04957",
    435       "relevance": "Automatic prompt injection attack method used as the adaptive attack baseline in this paper."
    436     }
    437   ]
    438 }

Impressum · Datenschutz