ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24518B)


      1 {
      2   "paper": {
      3     "title": "CAPTURE: Context-Aware Prompt Injection Testing and Robustness Enhancement",
      4     "authors": ["Gauri Kholkar", "Ratinder Ahuja"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2505.12368",
      8     "doi": "10.48550/arXiv.2505.12368"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "The paper states 'The dataset generation pipeline code will be shared upon publication' (footnote 1, Section 1). A promise of future release counts as NO per the schema."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No link to the generated CAPTURE datasets is provided in the paper. The datasets (MALICIOUS-GEN, SAFE-GEN) are described but not made available."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper mentions using DeBERTaV3-base and GPT-4o but does not specify library versions or a reproducible environment setup."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions are provided. While the data generation pipeline is described at a high level (Sections 2.1-2.3), there are no runnable commands, scripts, or a README with reproduction steps."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "All results in Tables 2, 3, and 6 are reported as point estimates (e.g., '0.15%', '79.04%') with no confidence intervals, error bars, or uncertainty quantification."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper makes comparative claims (e.g., CaptureGuard outperforms others) but uses no statistical significance tests. Differences are assessed purely by comparing raw FNR/FPR numbers."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper reports raw FNR and FPR percentages but does not report formal effect sizes (e.g., Cohen's d, odds ratios). While percentage differences are shown, there is no baseline-contextualized effect size measure."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The dataset sizes (1274 training, 641 test/validation for MALICIOUS-GEN; 339 training, 171 test/validation for SAFE-GEN) are stated but not justified. No power analysis or rationale for these specific sizes is given."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No variance, standard deviation, or spread measures are reported. CaptureGuard models appear to be trained once per domain with single-run results reported."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper evaluates five specialized guardrail models (ProtectAIv2, InjecGuard, PromptGuard, Deepset, Fmops) and two LLMs (GPT-4o, Llama3.2-1B-Instruct) as baselines (Section 3, Tables 2 and 3)."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The baselines include contemporary models: InjecGuard (2024), PromptGuard (Meta, 2024), ProtectAIv2 (2024), GPT-4o (2024), and Llama3.2-1B-Instruct (Meta, 2024). These represent current state-of-the-art prompt injection detectors."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No ablation study is conducted. The paper does not analyze the individual contributions of the Framework, Separator, or Disruptor components, nor does it ablate the training data composition for CaptureGuard."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper uses FNR (False Negative Rate) and FPR (False Positive Rate) as two complementary metrics (Tables 2, 3), plus accuracy on external benchmarks (Table 6)."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "Human validation is mentioned briefly ('human validation showed approximately 90% agreement with its malicious/benign classifications' in Section 3), but no detailed human evaluation of the system's outputs or the benchmark quality is presented. The details of this validation (who, how many, protocol) are absent."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The paper describes explicit train/test/validation splits: '30 train, 15 test, 15 validation examples' for base sets (Section 2.1), with separate test sets used for evaluation in Tables 2, 3, and external benchmarks in Table 6."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down across all six domains (Stock, Movies, Python, Travel, Covid, Shopping) in Tables 2 and 3, and by benchmark (NotInject, WildGuard, BIPIA) in Table 6."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No qualitative failure analysis is provided. The paper reports aggregate FNR/FPR numbers but does not examine specific examples where CaptureGuard fails, nor analyze patterns in its errors."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports that CaptureGuard underperforms InjecGuard on NotInject benchmark (79.04% vs 87.31%) and on BIPIA Injection (54.77% vs 68.34%), as shown in Table 6. The paper acknowledges 'slightly lower than InjecGuard's 87.32%, indicating a marginal trade-off in benign prompt detection.'"
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims that existing models 'suffer from high false negatives in adversarial cases and excessive false positives in benign scenarios' — supported by Tables 2-3 (e.g., Fmops 100% FNR, PromptGuard 100% FPR). The claim that CaptureGuard 'drastically reduces both false negative and false positive rates' is supported by its near-zero FNR and low FPR in Table 2."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "The paper claims CaptureGuard's performance improvement results from 'context-aware training data' (Section 3), implying a causal link. However, no ablation isolates this factor — CaptureGuard also uses InjecGuard's 26 existing datasets alongside CAPTURE data, making it unclear which data source drives the improvement."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The abstract and conclusion make broad claims about 'more robust and practical prompt injection defenses' and 'a clear methodology...to advance the field,' but CaptureGuard was only trained and tested on three domains (Python, Movies, Stock) out of six. The paper's title implies general context-awareness but results are limited to specific domain configurations."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The Limitations section (Section 5) discusses the key alternative explanation: GPT-4o was used for both data generation and as an evaluation baseline, introducing potential bias where 'generated data may inadvertently reflect the stylistic and logical patterns of the generator model, potentially giving GPT-4o an advantage in detection.'"
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper refers to 'GPT-4o' and 'Llama3.2-1B-Instruct' without specifying API versions or snapshot dates. GPT-4o behavior changes across versions, so 'GPT-4o' alone is insufficient. No version date or API endpoint is provided."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Full prompt texts are provided in the Appendix (Figures 2-5) for all key generation and evaluation tasks: SAFE-GEN generation (Figure 2), S/D identification (Figure 3), subtle separator generation (Figure 4), and LLM evaluation (Figure 5)."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Table 4 reports temperature settings for each GPT-4o task (0, 0.5, 0.7). Table 5 provides CaptureGuard hyperparameters: batch size 32, learning rate 2e-5, max sequence length 64, optimizer Adam, 1 epoch, classification threshold 0.5."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used. The system is a data generation pipeline plus standard classification model training, not an agentic workflow."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The data generation pipeline is documented in detail across Sections 2.1-2.3: sourcing from public QA datasets, expansion via GPT-4o (100 examples per domain per split), decomposition of attacks into S and D components, refinement of separators, and final assembly. Dataset sizes at each stage are provided in Table 4."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 5 'LIMITATIONS' is a dedicated limitations section with substantive discussion of scope, attack diversity constraints, and generator model bias."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The Limitations section identifies specific threats: (1) focus limited to 'direct, single-turn prompt injections' excluding 'indirect and multi-turn attacks,' (2) 'reliance on a single powerful model, GPT-4o, for both data generation and as an evaluation baseline' creating potential bias, (3) 'attack diversity is constrained by the source datasets used.'"
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The Limitations section explicitly states what is not tested: indirect attacks, multi-turn attacks, and non-conversational LLM applications. It states the scope is limited to 'direct, single-turn prompt injections' and names extending the benchmark to settings 'not limited to conversational LLM applications' as a future goal."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The generated datasets (MALICIOUS-GEN, SAFE-GEN) are not released. Only example prompts are shown in Table 1. The raw data is not available for independent verification."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The data collection process is described in detail: source QA datasets are named (Shopping, Covid, Movies, Stock, Travel, Python Code with citations), the expansion process via GPT-4o is documented (Section 2.1), and attack source datasets are cited (Erdogan et al., 2024; Schulhoff et al., 2023; Yugen.ai, 2023)."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants were recruited. The data is entirely synthetic, generated from public datasets and GPT-4o."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The pipeline from source data to final datasets is documented: (1) source QA datasets → (2) GPT-4o expansion to 100 per domain/split → (3) attack decomposition into S/D → (4) separator refinement → (5) assembly into context-aware prompts. Dataset sizes are given at each stage (Section 2.1-2.3, Table 4)."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding disclosure or acknowledgments section is present in the paper. Both authors are affiliated with Pure Storage."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly listed: both Gauri Kholkar and Ratinder Ahuja are affiliated with Pure Storage, with institutional email addresses provided."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding is disclosed, so independence cannot be assessed. The work comes from Pure Storage, an enterprise data storage company. While Pure Storage does not appear to be a direct competitor to the evaluated guardrail models, the lack of any funding disclosure means this criterion is not satisfied."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper evaluates GPT-4o and Llama3.2-1B-Instruct as prompt injection detectors, and uses GPT-4o for data generation, but does not state the training data cutoff for any model. This is relevant because GPT-4o may have seen the source attack datasets in its training data."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper does not discuss whether the LLMs (GPT-4o, Llama3.2) may have seen the source attack datasets (HackAPrompt, Safe-Guard, etc.) during pre-training. It also does not address whether GPT-4o's data generation creates a favorable evaluation setting for itself beyond the brief mention in limitations."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The paper does not discuss contamination risk for the external benchmarks used for evaluation (NotInject, WildGuard, BIPIA). In particular, it does not address whether the evaluated models, including GPT-4o, may have seen these benchmarks during training."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants in this study. The data is entirely synthetic."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No inference cost, latency, or token usage is reported. The paper uses GPT-4o for data generation and evaluation but does not quantify the associated API costs."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No computational budget is stated. The paper does not report GPU hours for CaptureGuard training, total API spend for GPT-4o usage, or hardware specifications."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "Existing prompt guardrail models suffer from high false negatives in context-aware adversarial settings (e.g., Fmops achieves 100% FNR across all domains).",
    287       "evidence": "Tables 2 and 3 show Fmops with 100% FNR across all six domains, InjecGuard with 99.84-100% FNR in most domains, and Llama3.2-1B-Instruct with 58-76% FNR.",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "Existing guardrail models exhibit excessive false positive rates on benign context-aware prompts (e.g., PromptGuard achieves 100% FPR in Stock and Movies).",
    292       "evidence": "Tables 2 and 3 show PromptGuard with 100% FPR in Stock, Movies, Travel, Covid, and Shopping domains, and InjecGuard with 98-100% FPR in most domains.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "CaptureGuard achieves near-zero FNR (0.00-0.15%) and low FPR (0.00-2.05%) on the CAPTURE benchmark.",
    297       "evidence": "Table 2 shows CaptureGuard FNR of 0.15%, 0.00%, 0.00% and FPR of 0.00%, 2.05%, 2.05% for Stock, Movies, and Python domains respectively.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "CaptureGuard generalizes effectively to external benchmarks, achieving competitive performance.",
    302       "evidence": "Table 6 shows CaptureGuard achieving 79.04% on NotInject, 75.00% on WildGuard, and 54.77% on BIPIA Injection. However, it underperforms InjecGuard on all three benchmarks (87.31%, 76.11%, 68.34% respectively).",
    303       "supported": "weak"
    304     }
    305   ],
    306   "methodology_tags": ["benchmark-eval"],
    307   "key_findings": "CAPTURE reveals that existing prompt injection guardrail models are highly vulnerable to context-aware attacks, with models like Fmops showing 100% false negative rates and PromptGuard showing 100% false positive rates. The proposed CaptureGuard model, trained on CAPTURE's context-aware generated data, achieves near-zero FNR and low FPR on the in-domain benchmark but shows weaker generalization to external benchmarks (NotInject, WildGuard, BIPIA), where it underperforms InjecGuard. The work highlights a fundamental tradeoff between attack detection sensitivity and over-defense in existing guardrail models.",
    308   "red_flags": [
    309     {
    310       "flag": "Generator-evaluator conflation",
    311       "detail": "GPT-4o is used both to generate the benchmark data and as an evaluation baseline. The paper acknowledges this but does not adequately control for it — GPT-4o's strong performance on detection (low FNR, low FPR) may partly reflect familiarity with its own generated patterns rather than genuine detection capability."
    312     },
    313     {
    314       "flag": "In-domain evaluation advantage",
    315       "detail": "CaptureGuard's impressive near-zero FNR/FPR is measured on test splits of the same CAPTURE benchmark that supplied its training data (same domains, same generation pipeline). On external benchmarks (Table 6), CaptureGuard underperforms InjecGuard on all three tests, suggesting limited generalization despite claims of 'generalizing effectively.'"
    316     },
    317     {
    318       "flag": "No uncertainty quantification",
    319       "detail": "All results are single-run point estimates with no confidence intervals, error bars, or variance across multiple training seeds. Given the relatively small test set sizes (e.g., 171 safe samples), individual classification decisions can substantially shift FPR."
    320     },
    321     {
    322       "flag": "Missing ablation",
    323       "detail": "CaptureGuard is trained on both CAPTURE-generated data AND InjecGuard's 26 existing datasets. No ablation isolates the contribution of the context-aware CAPTURE data versus the existing datasets, making the causal claim about context-aware training data unsubstantiated."
    324     },
    325     {
    326       "flag": "Code and data not released",
    327       "detail": "Despite claims of providing 'a clear methodology and a powerful baseline to advance the field,' neither the code nor the datasets are released. The promise of future release upon publication does not allow independent verification."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    333       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    334       "year": 2023,
    335       "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications, directly relevant to prompt injection security research."
    336     },
    337     {
    338       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    339       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    340       "year": 2024,
    341       "relevance": "Systematic formalization and benchmarking of prompt injection attacks and defenses, published at USENIX Security."
    342     },
    343     {
    344       "title": "InjecGuard: Benchmarking and mitigating over-defense in prompt injection guardrail models",
    345       "authors": ["Hao Li", "Xiaogeng Liu"],
    346       "year": 2024,
    347       "arxiv_id": "2410.22770",
    348       "relevance": "Key baseline for prompt injection detection that identifies the over-defense problem; provides the training framework adopted by CaptureGuard."
    349     },
    350     {
    351       "title": "Prompt injection attack against llm-integrated applications",
    352       "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang"],
    353       "year": 2023,
    354       "arxiv_id": "2306.05499",
    355       "relevance": "Introduces the Framework-Separator-Disruptor attack structure for context-aware prompt injection that CAPTURE builds upon."
    356     },
    357     {
    358       "title": "Jatmo: Prompt injection defense by task-specific finetuning",
    359       "authors": ["Julien Piet", "Maha Alrashed", "Chawin Sitawarin"],
    360       "year": 2024,
    361       "arxiv_id": "2312.17673",
    362       "relevance": "Proposes task-specific fine-tuning as a defense against prompt injection, relevant to evaluating defense strategies."
    363     },
    364     {
    365       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    366       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Emre Kiciman"],
    367       "year": 2023,
    368       "arxiv_id": "2312.14197",
    369       "relevance": "Benchmarking indirect prompt injection attacks and defenses on LLMs, directly relevant to the survey's coverage of LLM security evaluation."
    370     },
    371     {
    372       "title": "PromptShield: Deployable detection for prompt injection attacks",
    373       "authors": ["Dennis Jacob", "Hend Alzahrani", "Zhanhao Hu", "Basel Alomair", "David Wagner"],
    374       "year": 2025,
    375       "arxiv_id": "2501.15145",
    376       "relevance": "Recent prompt injection detection system relevant to evaluating the state of deployable guardrail defenses."
    377     },
    378     {
    379       "title": "Ignore this title and hackaprompt: Exposing systemic vulnerabilities of llms through a global prompt hacking competition",
    380       "authors": ["Sander Schulhoff", "Jeremy Pinto", "Anaum Khan"],
    381       "year": 2023,
    382       "relevance": "Large-scale prompt hacking competition providing attack techniques and datasets used as source data for CAPTURE's MALICIOUS-GEN pipeline."
    383     },
    384     {
    385       "title": "WildGuard: Open one-stop moderation tools for safety risks, jailbreaks, and refusals of llms",
    386       "authors": ["Seungju Han", "Kavel Rao", "Allyson Ettinger"],
    387       "year": 2024,
    388       "arxiv_id": "2406.18495",
    389       "relevance": "Open-source LLM moderation toolkit used as an external benchmark for evaluating guardrail model generalization."
    390     },
    391     {
    392       "title": "Ignore previous prompt: Attack techniques for language models",
    393       "authors": ["Fábio Perez", "Ian Ribeiro"],
    394       "year": 2022,
    395       "arxiv_id": "2211.09527",
    396       "relevance": "Early systematic study of prompt injection attack techniques against language models."
    397     }
    398   ]
    399 }

Impressum · Datenschutz