ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (20187B)


      1 {
      2   "paper": {
      3     "title": "A Critical Evaluation of Defenses against Prompt Injection Attacks",
      4     "authors": ["Yuqi Jia", "Zedian Shao", "Yupei Liu", "Jinyuan Jia", "Dawn Song", "Neil Zhenqiang Gong"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2505.18333"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository provided: https://github.com/PIEval123/PIEval, stated in the abstract."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "They use publicly available benchmarks (OpenPromptInjection under MIT license, MMLU) and construct MMLU-PI from MMLU. Code/data repository is provided."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions using 'one single NVIDIA RTX A5000 GPU with 24GB memory' but provides no requirements.txt, library versions, or environment setup details."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but the paper itself contains no README-style instructions."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results (ASV, FPR, FNR, utility scores) are reported as point estimates with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims defenses are 'not as effective as previously reported' based on comparing numbers without any statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are contextualized with baselines. E.g., utility drops from 0.65 to 0.54 for StruQ on OpenPromptInjection (Table 1b), ASV increases from 0.04 to 0.80 under GCG (Table 2). Absolute values with baseline context provided throughout."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for sample sizes. For GCG attacks they select 50 tuples from OpenPromptInjection and 25 from MMLU-PI 'for computational efficiency' but do not justify whether this is sufficient."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures reported across any experiments. All results appear to be single-run numbers."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Undefended LLMs serve as baselines (Llama-3-8B-Instruct, Llama-3-8B-undefended) and results are compared against original defense papers' claims."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Defenses evaluated are recent (StruQ 2025, SecAlign 2024, Instruction Hierarchy 2024, PromptGuard 2024, Attention Tracker 2024)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "For adaptive attacks on Attention Tracker, they ablate what components are optimized: separator only, separator+instruction, separator+instruction+data (Table 3b)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics used: ASV for effectiveness, absolute utility, relative utility (win rate), FPR, FNR, and AUC for detection-based defenses."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a security evaluation paper measuring attack success rates and defense effectiveness via automated metrics. Human evaluation of outputs is not relevant to the claims."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "They use separate benchmarks (OpenPromptInjection, MMLU-PI) from those used by the original defense papers, providing independent evaluation."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results broken down per defense (StruQ, SecAlign, Instruction Hierarchy, PromptGuard, Attention Tracker), per benchmark (OpenPromptInjection, MMLU-PI), and per attack type (existing vs adaptive, Combined vs GCG)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The entire paper is about demonstrating failure cases of existing defenses. They discuss where adaptive Combined Attack does not improve over existing (Section 5.1) due to long-context issues."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "They report that adaptive Combined Attack does not always outperform existing attacks (Section 5.1), and that Llama-3-8B base models show smaller utility drops than Instruct models (Appendix A)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims that 'existing defenses are not as successful as previously reported' are supported by Tables 1-5 showing utility degradation and high ASVs under diverse evaluation."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims are limited and well-supported. E.g., adaptive attacks using embedding-similar tokens cause higher ASV (controlled manipulation). The ablation of optimizable components in Table 3b supports causal reasoning about attack strategies."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper evaluates only Llama-3-8B variants and GPT-4o-mini but makes broad claims about 'existing defenses' in general. The title and abstract suggest comprehensive coverage but only five defenses (StruQ, SecAlign, Instruction Hierarchy, PromptGuard, Attention Tracker) are tested on limited model families."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations. For example, utility drops could be partially due to evaluation methodology differences rather than inherent defense weakness. The paper does not consider that the original evaluations may have had valid reasons for their benchmark choices."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are named (Llama-3-8B-Instruct, GPT-4o-mini, GPT-4-Turbo, Qwen2-1.5B-Instruct) but no specific version snapshots, API dates, or checkpoint identifiers are provided. In particular, the API-served models GPT-4o-mini and GPT-4-Turbo are used without version/snapshot dates."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper describes prompt structures formally (concatenation of instruction, data, separator) but does not provide actual prompt text used in experiments."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Only α=0.01 for the adaptive attack loss balance is reported. GCG optimization hyperparameters, temperature settings for LLM inference, and other critical parameters are not stated."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a direct attack/defense evaluation study."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 5 describes how benchmarks are constructed: OpenPromptInjection has 700 prompt-response pairs (100 per task), T contains 4,900 tuples. MMLU-PI construction from MMLU test split is described with exact counts (200 pairs, 1,000 tuples)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section exists in the paper."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed anywhere in the paper."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit scope boundaries stated. The paper does not clarify what the results do NOT show or what settings were not tested."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Code and data are available at the GitHub repository. The benchmarks used (OpenPromptInjection, MMLU) are publicly available."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 5 describes how benchmarks were constructed, including sampling procedures and dataset sizes for both OpenPromptInjection and MMLU-PI."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. All experiments use automated benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from benchmark datasets to evaluation tuples (p_t, r_t, p_e, r_e) is formally specified with equations and exact counts at each stage."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section found in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations clearly listed: Duke University, Penn State, UC Berkeley."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial disclosure statement found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper tests defense effectiveness against attacks, not model knowledge on benchmarks. Contamination of training data with benchmark content is not relevant to the claims."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The study evaluates attack/defense dynamics, not model capability on knowledge benchmarks. Train/test overlap is not a concern for the claims made."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Same rationale: the paper evaluates defenses against prompt injection, not model knowledge. Benchmark contamination is irrelevant."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference costs, API costs, or wall-clock times reported despite running extensive experiments with GCG optimization and multiple LLMs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Only the GPU type is mentioned (NVIDIA RTX A5000 24GB). No total compute budget, GPU hours, or training/inference time reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "StruQ and SecAlign lead to a loss of both relative and absolute utility, contrary to original claims",
    286       "evidence": "Table 1: StruQ achieves only 21.60% win rate vs undefended model; absolute utility drops 0.11 and 0.10 on two benchmarks. SecAlign drops 0.17 and 0.11.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "StruQ and SecAlign are not as effective against existing attacks as previously reported when evaluated on diverse prompts",
    291       "evidence": "Table 2: GCG achieves ASVs exceeding 0.80 against StruQ vs original reported 0.04; SecAlign shows 0.72 ASV on MMLU-PI vs original reported 0.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Instruction Hierarchy (deployed on GPT-4o-mini) is not effective against diverse injected prompts",
    296       "evidence": "Section 5.2: Combined Attack achieves ASVs of 0.68 and 0.75 on OpenPromptInjection and MMLU-PI respectively.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "AUC alone is insufficient for assessing detection-based defenses",
    301       "evidence": "Table 3a: PromptGuard has AUC 0.92 but FPR 0.89; Attention Tracker has AUC 1.00 but FNR 0.69 on MMLU-PI.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Adaptive attacks significantly undermine detection-based defenses",
    306       "evidence": "Table 3b: Adaptive attacks raise Attention Tracker FNR from 0.00 to 0.66-1.00 depending on strategy and benchmark.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "Existing prompt injection defenses (StruQ, SecAlign, Instruction Hierarchy, PromptGuard, Attention Tracker) are not as effective as originally claimed when evaluated with diverse target/injected prompts and adaptive attacks. Prevention-based defenses show significant utility degradation when measured with absolute utility metrics rather than only relative win rates. Detection-based defenses that appear strong by AUC metrics show high false positive or false negative rates in practice. Adaptive attacks, particularly optimization-based ones like GCG, can dramatically increase attack success values against all tested defenses.",
    312   "red_flags": [
    313     {
    314       "flag": "No uncertainty quantification",
    315       "detail": "All results are single point estimates with no error bars, confidence intervals, or variance across runs. GCG optimization is stochastic, so single-run results may not be representative."
    316     },
    317     {
    318       "flag": "No limitations section",
    319       "detail": "The paper has no limitations or threats-to-validity discussion despite making strong claims about the inadequacy of prior defense evaluations."
    320     },
    321     {
    322       "flag": "Small GCG evaluation subset",
    323       "detail": "GCG attacks are evaluated on only 50 tuples (OpenPromptInjection) and 25 tuples (MMLU-PI) 'for computational efficiency' without justification that this is representative."
    324     },
    325     {
    326       "flag": "Limited model coverage for broad claims",
    327       "detail": "Only Llama-3-8B variants and GPT-4o-mini are tested, but claims extend to defenses generally. Different model architectures or sizes could yield different results."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Struq: Defending against prompt injection with structured queries",
    333       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    334       "year": 2025,
    335       "relevance": "Prevention-based defense against prompt injection evaluated in this paper."
    336     },
    337     {
    338       "title": "Aligning llms to be robust against prompt injection",
    339       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "Chuan Guo"],
    340       "year": 2024,
    341       "arxiv_id": "2410.05451",
    342       "relevance": "SecAlign defense using DPO fine-tuning for prompt injection resistance, evaluated in this paper."
    343     },
    344     {
    345       "title": "The instruction hierarchy: Training llms to prioritize privileged instructions",
    346       "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"],
    347       "year": 2024,
    348       "arxiv_id": "2404.13208",
    349       "relevance": "Instruction hierarchy defense deployed on GPT-4o-mini, shown to be ineffective against diverse attacks."
    350     },
    351     {
    352       "title": "Attention tracker: Detecting prompt injection attacks in llms",
    353       "authors": ["Kuo-Han Hung", "Ching-Yun Ko", "Ambrish Rawat", "I-Hsin Chung", "Winston H. Hsu", "Pin-Yu Chen"],
    354       "year": 2024,
    355       "arxiv_id": "2411.00348",
    356       "relevance": "Detection-based defense shown vulnerable to adaptive attacks in this evaluation."
    357     },
    358     {
    359       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    360       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    361       "year": 2024,
    362       "relevance": "OpenPromptInjection benchmark used as primary evaluation framework in this paper."
    363     },
    364     {
    365       "title": "Obfuscated gradients give a false sense of security: Circumventing defenses to adversarial examples",
    366       "authors": ["Anish Athalye", "Nicholas Carlini", "David Wagner"],
    367       "year": 2018,
    368       "relevance": "Foundational work on adaptive attacks against adversarial example defenses, motivating the adaptive attack methodology used here."
    369     },
    370     {
    371       "title": "Universal and transferable adversarial attacks on aligned language models",
    372       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J Zico Kolter", "Matt Fredrikson"],
    373       "year": 2023,
    374       "arxiv_id": "2307.15043",
    375       "relevance": "GCG attack method used as the primary optimization-based attack in this evaluation."
    376     },
    377     {
    378       "title": "Automatic and universal prompt injection attacks against large language models",
    379       "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"],
    380       "year": 2024,
    381       "arxiv_id": "2403.04957",
    382       "relevance": "Optimization-based prompt injection attack method relevant to the attack taxonomy discussed."
    383     },
    384     {
    385       "title": "DataSentinel: A game-theoretic detection of prompt injection attacks",
    386       "authors": ["Yupei Liu", "Yuqi Jia", "Jinyuan Jia", "Dawn Song", "Neil Zhenqiang Gong"],
    387       "year": 2025,
    388       "relevance": "Game-theoretic approach to prompt injection detection from the same research group."
    389     },
    390     {
    391       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    392       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"],
    393       "year": 2023,
    394       "arxiv_id": "2312.14197",
    395       "relevance": "Benchmark and defense study for indirect prompt injection attacks."
    396     }
    397   ]
    398 }

Impressum · Datenschutz