scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25038B)
      1 {
      2   "paper": {
      3     "title": "Attention Tracker: Detecting Prompt Injection Attacks in LLMs",
      4     "authors": [
      5       "Kuo-Han Hung",
      6       "Ching-Yun Ko",
      7       "Ambrish Rawat",
      8       "I-Hsin Chung",
      9       "Winston H. Hsu",
     10       "Pin-Yu Chen"
     11     ],
     12     "year": 2024,
     13     "venue": "arXiv",
     14     "arxiv_id": "2411.00348"
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "A project page is provided at https://huggingface.co/spaces/TrustSafeAI/Attention-Tracker, referenced in the abstract. This is a working URL to a HuggingFace space."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper uses two publicly available datasets: Open-Prompt-Injection (Liu et al., 2024b) and deepset prompt injection dataset (deepset, 2023, hosted on HuggingFace). Both are public benchmarks that were not modified."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "Appendix A.4 mentions 'PyTorch and an NVIDIA RTX 3090' but does not provide a requirements.txt, Dockerfile, or detailed library versions. This is insufficient for environment recreation."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "While the algorithm is described in Algorithm 1 and the appendices detail dataset and baseline settings, there are no step-by-step reproduction instructions, README, or runnable scripts provided in the paper."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Table 1 reports ± notation for non-deterministic methods (e.g., '0.52±0.03' for LLM-based detection). The remaining methods (Protect AI, Prompt-Guard, Attention Tracker) are explicitly noted as deterministic, which explains the absence of error bars for them."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper claims Attention Tracker outperforms baselines ('AUROC improvement of up to 10.0%') but does not report any statistical significance tests such as p-values or confidence intervals for the performance differences."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The paper reports absolute AUROC values with baselines (e.g., Table 1 shows Attention Tracker at 1.00 vs Protect AI at 0.69 on Qwen2 Open-Prompt-Injection), providing enough context to assess the magnitude of improvement."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper uses 2,000 testing queries for Open-Prompt-Injection, the test split of the deepset dataset, and 30 LLM-generated sentences for head selection, but provides no justification for these sample sizes or discussion of statistical power."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Table 1 states 'The results were run five times using different seeds' and reports standard deviations (± values) for non-deterministic methods. For deterministic methods, the single-run nature is explicitly noted."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Section 5.1 lists four baselines: Protect AI detector, Prompt-Guard (trained detectors), LLM-based Detection, and Known-answer Detection (training-free methods). All are compared in Table 1."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The baselines include contemporary methods: Prompt-Guard (Meta, 2024), Protect AI detector (2024), and methods from Liu et al. (2024b). These are recent and represent the state of the art in prompt injection detection."
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Section 5.4 contains ablation studies: Table 2 examines the number of selected heads (varying k from 0 to 5), and Figure 6 analyzes the impact of data length proportion on the focus score."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "The paper reports only AUROC as the detection performance metric. No other metrics such as F1, precision, recall, or accuracy at specific thresholds are reported."
     86       },
     87       "human_evaluation": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "This is a detection method evaluated on automated benchmarks. Human evaluation is not relevant to the claims about detection accuracy."
     91       },
     92       "held_out_test_set": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The method uses only 30 LLM-generated sentences for head selection (Section 4.1) and evaluates on entirely separate benchmarks (Open-Prompt-Injection and deepset test set). There is clear separation between the head selection data and evaluation data."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Results are broken down per model (5 models) and per dataset (2 datasets) in Table 1. The appendix (A.2) describes the five tasks used. Figure 3 breaks down results by attack type."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "The paper does not discuss specific failure cases or examples where Attention Tracker fails to detect prompt injection attacks. The Limitation section only mentions the need for access to internal attention scores."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Table 2 shows that selecting too many heads (k=0, k=1) or too few heads (k=5) hurts performance, with AUROC dropping to 0.824 and 0.869 respectively. This is a genuine negative result about design choices."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims 'AUROC improvement of up to 10.0% over existing methods' which is supported by Table 1 (deepset dataset). Claims about effectiveness on small LLMs are supported by results on Qwen2-1.5B. Claims about generalization across models, datasets, and attack types are supported by the experimental results."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper makes causal claims about the 'distraction effect' causing attention shifts during prompt injection attacks. These are supported by controlled visualizations (Figures 2, 3) comparing normal vs. attack data with specific attention head analysis, and the ablation in Table 2 validates that selecting these heads causally affects detection performance."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper claims to 'generalize effectively across diverse models, datasets, and attack types' but tests only 5 open-source models (1.5B-9B parameters), 2 datasets, and 4-5 attack types. The Limitation section acknowledges the closed-source LLM restriction but does not bound the generalization claims to the tested setting in the abstract or title."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper does not discuss alternative explanations for why the distraction effect occurs or whether other mechanisms besides attention shifting could explain detection performance. The Limitation section only addresses the closed-source access issue, not alternative interpretations of the results."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Section 5.1 specifies exact model names: 'Qwen2-1.5B-Instruct', 'Phi-3-mini-4k-instruct', 'Mistral-7B-Instruct-v0.3', 'Meta-Llama-3-8B-Instruct', and 'Gemma-2-9b-it'. These are specific versioned model identifiers."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper provides the actual prompts used: the LLM-based detection prompt (Appendix A.3, Listing 1), the Known-answer detection prompt (Listing 2), and the head selection instruction ('Say {random word}') with the attack string format in Algorithm 1 and Appendix A.6."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "The key hyperparameter k=4 is reported and justified in Section 4.1 and ablated in Table 2. The number of generated sentences (30) is stated. However, LLM inference parameters like temperature are not explicitly stated for the evaluated models."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. Attention Tracker is a detection method that operates on attention scores during a single forward pass."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Appendix A.2 describes the dataset construction: 5 tasks, 50 target-inject data pairs each, 4 attack types, totaling 2,000 queries. Appendix A.3 explains baseline configurations. Section 5.1 describes how instruction/data are placed in system/user prompts for each model."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "A dedicated 'Limitation' section appears after the Conclusion (Section 6), discussing the reliance on internal attention scores and the restriction to open-source models."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "The Limitation section identifies a specific threat: 'reliance on internal information from LLMs, such as attention scores, during inference for attack detection. For closed-source LLMs, only model developers typically have access to this internal information.' This is specific to this study, not a generic disclaimer."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The paper does not explicitly state what was not tested. It does not bound claims to specific model sizes, attack sophistication levels, or acknowledge that only open-source models in the 1.5B-9B range were tested. The limitation section only addresses closed-source access, not the scope of the empirical evaluation."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Both evaluation datasets are publicly available: Open-Prompt-Injection (Liu et al., 2024b, USENIX Security) and deepset prompt injection dataset (hosted on HuggingFace at deepset/prompt-injections). The HuggingFace space also provides access to the method."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Appendix A.2 describes the dataset composition: 5 tasks with 50 target-inject pairs and 4 attack types for Open-Prompt-Injection. Appendix A.6 describes the LLM-generated dataset for head selection (30 GPT-4 generated sentences with ignore attack appended)."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants are involved. The data sources are standard public benchmarks and LLM-generated sentences."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The data pipeline is documented: Section 4.1 and Appendix A.6 describe how head selection data is generated, Algorithm 1 describes the full detection pipeline from input to output, and Appendix A.2-A.3 describe evaluation data preparation."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "The Acknowledgement section thanks the NTU Overseas Internship Program and IBM researchers but does not disclose specific funding sources or grants."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are clearly stated: '1IBM Research, 2National Taiwan University'. The work was done at IBM Thomas J. Watson Research Center."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Most authors are affiliated with IBM Research, but no IBM product is evaluated in this paper. The evaluated models are Qwen2, Phi-3, Mistral, Llama-3, and Gemma-2 — none are IBM products. IBM has no financial stake in the specific outcome of these comparisons."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests statement or financial disclosure is present in the paper."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "This paper does not evaluate a pre-trained model's knowledge on a benchmark. It evaluates a detection method that operates on attention patterns during inference. The benchmarks test the detection method, not the LLM's learned knowledge."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "Contamination is not relevant because the detection method does not depend on the LLM having or not having seen the benchmark data. The method detects attention pattern shifts, not knowledge-based answers."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "Benchmark contamination is not relevant here: the detection task is about attention pattern analysis, not about whether the model can answer questions from training data. Contamination of the underlying model does not affect the detection task."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants are involved in this study."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants are involved in this study."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants are involved in this study."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants are involved in this study."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants are involved in this study."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants are involved in this study."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants are involved in this study."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": true,
    281         "justification": "The paper explicitly notes that detection is obtained 'for free' during LLM inference (Section 4.2): 'the focus score FS is obtained directly during the LLM inference of the test query, making the detection cost negligible compared to the original inference cost.' Appendix A.4 states 'Each run of our method on a single model through two datasets took about one hour to evaluate.'"
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "Appendix A.4 states: 'We conducted all experiments using PyTorch and an NVIDIA RTX 3090. Each run of our method on a single model through two datasets took about one hour to evaluate.' This provides hardware and wall-clock time."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "Attention Tracker achieves AUROC improvement of up to 10.0% over all existing detection methods on the deepset dataset.",
    293       "evidence": "Table 1 shows Attention Tracker achieves 0.99 AUROC on Llama3/deepset while the best baseline (LLM-based) achieves 0.92, and on Mistral/deepset Attention Tracker gets 0.99 vs Protect AI at 0.90.",
    294       "supported": "strong"
    295     },
    296     {
    297       "claim": "Attention Tracker achieves up to 31.3% average AUROC improvement over all existing training-free detection methods on the Open-Prompt-Injection benchmark.",
    298       "evidence": "Table 1 shows training-free methods (LLM-based and Known-answer) perform substantially worse across models, with LLM-based achieving as low as 0.52 on Qwen2 vs Attention Tracker at 1.00.",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "The distraction effect generalizes across different datasets and attack types.",
    303       "evidence": "Figure 5 shows consistent patterns of important heads across three datasets (deepset, Open-Prompt-Injection, LLM-generated). Figure 3 shows the distraction effect correlates with attack success rate across different attack strategies.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "Attention Tracker is effective even on small LLMs with only 1.5 billion parameters.",
    308       "evidence": "Table 1 shows Qwen2-1.5B achieves AUROC of 1.00 on Open-Prompt-Injection and 0.98 on deepset, outperforming all baselines.",
    309       "supported": "strong"
    310     },
    311     {
    312       "claim": "Only approximately 0.3% of attention heads (k=4) are needed for optimal detection performance.",
    313       "evidence": "Table 2 shows k=4 achieves the best AUROC of 0.986 with only 0.3% of heads selected, while using all heads gives only 0.821.",
    314       "supported": "strong"
    315     }
    316   ],
    317   "methodology_tags": [
    318     "benchmark-eval"
    319   ],
    320   "key_findings": "The paper introduces the 'distraction effect,' showing that prompt injection attacks cause specific attention heads in LLMs to shift focus from original instructions to injected instructions. Based on this finding, Attention Tracker detects prompt injection attacks by monitoring attention patterns during inference without additional computation, achieving AUROC up to 10% higher than existing methods across 5 models and 2 datasets. The method works with as few as 0.3% of attention heads and is effective even on small models (1.5B parameters), though it requires access to internal model attention scores.",
    321   "red_flags": [
    322     {
    323       "flag": "Near-perfect AUROC scores",
    324       "detail": "Attention Tracker achieves AUROC of 0.99-1.00 across nearly all model-dataset combinations in Table 1, which is unusually high. While the paper is transparent about the evaluation, the near-perfect results warrant scrutiny about whether the attack benchmarks are sufficiently challenging or adversarially robust."
    325     },
    326     {
    327       "flag": "Limited attack diversity",
    328       "detail": "The evaluation covers only template-based prompt injection attacks (naive, escape, ignore, fake complete, combine). More sophisticated adaptive attacks that might specifically try to preserve normal attention patterns are not evaluated."
    329     },
    330     {
    331       "flag": "Single evaluation metric",
    332       "detail": "Only AUROC is reported. Metrics like precision, recall, F1, or accuracy at practical operating points (specific thresholds) are absent, making it hard to assess real-world deployment characteristics."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    338       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    339       "year": 2024,
    340       "relevance": "Defines the formal framework for prompt injection attacks and provides the Open-Prompt-Injection benchmark used in this paper's evaluation."
    341     },
    342     {
    343       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    344       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    345       "year": 2023,
    346       "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications, demonstrating real-world attack surfaces."
    347     },
    348     {
    349       "title": "The instruction hierarchy: Training llms to prioritize privileged instructions",
    350       "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"],
    351       "year": 2024,
    352       "arxiv_id": "2404.13208",
    353       "relevance": "Proposes a training-based defense against prompt injection by teaching LLMs to prioritize system-level instructions over user inputs."
    354     },
    355     {
    356       "title": "Prompt injection attacks in defended systems",
    357       "authors": ["Daniil Khomsky", "Narek Maloyan", "Bulat Nutfullin"],
    358       "year": 2024,
    359       "arxiv_id": "2406.14048",
    360       "relevance": "Evaluates prompt injection attacks against systems with defense mechanisms, revealing limitations of current defense strategies."
    361     },
    362     {
    363       "title": "Are you still on track!? catching llm task drift with activations",
    364       "authors": ["Sahar Abdelnabi", "Aideen Fay", "Giovanni Cherubin", "Ahmed Salem", "Mario Fritz", "Andrew Paverd"],
    365       "year": 2024,
    366       "arxiv_id": "2406.00799",
    367       "relevance": "Related approach to detecting prompt injection attacks using LLM activation patterns, training a classifier on activation distributions."
    368     },
    369     {
    370       "title": "Struq: Defending against prompt injection with structured queries",
    371       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    372       "year": 2024,
    373       "arxiv_id": "2402.06363",
    374       "relevance": "Proposes a defense method against prompt injection using structured query formats to separate instructions from data."
    375     },
    376     {
    377       "title": "Jatmo: Prompt injection defense by task-specific finetuning",
    378       "authors": ["Julien Piet", "Maha Alrashed", "Chawin Sitawarin", "Sizhe Chen", "Zeming Wei", "Elizabeth Sun", "Basel Alomair", "David Wagner"],
    379       "year": 2024,
    380       "relevance": "Defense against prompt injection through task-specific fine-tuning of LLMs to make them more robust."
    381     },
    382     {
    383       "title": "Lookback lens: Detecting and mitigating contextual hallucinations in large language models using only attention maps",
    384       "authors": ["Yung-Sung Chuang", "Linlu Qiu", "Cheng-Yu Hsieh", "Ranjay Krishna", "Yoon Kim", "James Glass"],
    385       "year": 2024,
    386       "arxiv_id": "2407.07071",
    387       "relevance": "Related work using attention maps for detecting undesirable LLM behavior (hallucinations), similar methodological approach to Attention Tracker."
    388     },
    389     {
    390       "title": "Tensor trust: Interpretable prompt injection attacks from an online game",
    391       "authors": ["Sam Toyer", "Olivia Watkins", "Ethan Adrian Mendes", "Justin Svegliato", "Luke Bailey"],
    392       "year": 2024,
    393       "relevance": "Provides a large-scale dataset and evaluation of prompt injection attacks through a gamified approach."
    394     },
    395     {
    396       "title": "Automatic and universal prompt injection attacks against large language models",
    397       "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"],
    398       "year": 2024,
    399       "arxiv_id": "2403.04957",
    400       "relevance": "Develops optimization-based universal prompt injection attacks, representing a more sophisticated attack category than template-based methods."
    401     },
    402     {
    403       "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks",
    404       "authors": ["Dario Pasquini", "Martin Strohmeier", "Carmela Troncoso"],
    405       "year": 2024,
    406       "arxiv_id": "2403.03792",
    407       "relevance": "Uses differentiable search methods to generate prompt injection triggers, relevant to evaluating robustness of detection methods."
    408     },
    409     {
    410       "title": "Ignore previous prompt: Attack techniques for language models",
    411       "authors": ["Fábio Perez", "Ian Ribeiro"],
    412       "year": 2022,
    413       "arxiv_id": "2211.09527",
    414       "relevance": "Foundational work cataloging prompt injection attack techniques for language models."
    415     }
    416   ]
    417 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs