ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (31992B)


      1 {
      2   "paper": {
      3     "title": "Mitigating Indirect Prompt Injection via Instruction-Following Intent Analysis",
      4     "authors": [
      5       "Mintong Kang",
      6       "Chong Xiang",
      7       "Sanjay Kariyappa",
      8       "Chaowei Xiao",
      9       "Bo Li",
     10       "Edward Suh"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv preprint",
     14     "arxiv_id": "2512.00966",
     15     "doi": "10.48550/arXiv.2512.00966"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "IntentGuard, a defense framework that uses instruction-following intent analysis to detect and mitigate indirect prompt injection, achieves zero false positives on benign inputs while reducing attack success rates by over 90% under strong adaptive attacks (e.g., 100% to 8.5% on Mind2Web with PAIR attacks and Qwen3-32B). The approach works by extracting the LLM's intended instructions via 'thinking intervention' strategies on reasoning-enabled models, then tracing each instruction's origin to flag those from untrusted data segments. The defense consistently outperforms training-free baselines (SEP Defense, PromptArmor) across two benchmarks and two models.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper. No mention of code release plans."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper uses publicly available benchmarks: AgentDojo (Debenedetti et al., 2024) and Mind2Web (Deng et al., 2023) with AdvAgent (Xu et al., 2024). Both are standard public datasets."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency listings are provided. The paper mentions model names but no runtime environment details."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. While the method is described algorithmically and prompts are given in the appendix, there are no concrete reproduction steps."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Tables 1 and 2 report only point estimates (e.g., Utility 0.773, ASR 0.468) with no confidence intervals, error bars, or ± notation."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims IntentGuard outperforms baselines across multiple settings but provides no statistical significance tests (no p-values, t-tests, or bootstrap tests). Comparisons rest solely on point estimates."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper consistently reports both baseline and treated values, enabling effect size assessment: 'ASR drops from 100.0% (vanilla) to 8.5% (IntentGuard)' and 'from 72.6% to 10.9%' with full comparison tables showing absolute values for all methods."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification for benchmark sizes (97 tasks/629 adversarial cases for AgentDojo; 440 tasks for Mind2Web). These are standard benchmarks used as-is, but no power analysis or sample size rationale is discussed."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. All results appear to be single-run numbers."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Three baselines are compared: (i) Vanilla (unprotected model), (ii) SEP Defense (Zverev et al., 2024), and (iii) PromptArmor (Shi et al., 2025). Results for all methods are shown in Tables 1 and 2."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "SEP Defense (2024) and PromptArmor (2025) are recent, representative training-free defenses. The paper notes these 'are representative training-free defenses and thus enable fair comparison with IntentGuard' (Section 4.1)."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Figure 3a ablates thinking intervention components (system prompt only, +start-of-thinking, +end-of-thinking, +both). Figure 3b ablates in-context demonstration strategies (format specification, conflict reasoning, adversarial reasoning). Table 3 compares sparse vs. dense embeddings."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Three metrics are reported: Utility (task performance), ASR (attack success rate), and FPR (false positive rate). Tables 1-2 report all three."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation is included. All evaluation is automated through the AgentDojo and Mind2Web benchmark frameworks."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Results are reported on established benchmark test sets: AgentDojo's 629 adversarial test cases and Mind2Web's 440 tasks. These benchmarks have predefined evaluation protocols."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down by attack type (Template, Beam Search, GCG, PAIR) across both benchmarks and both models in Tables 1-2. Ablation studies provide additional breakdowns by component configuration."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Figure 4 presents a confusion matrix analyzing IIA faithfulness, identifying 10.9% truly unfaithful cases and 5.3% where the model intends to follow but fails to execute correctly. The paper also discusses a single case of utility degradation (Mind2Web, Qwen3-32B: 0.840→0.810)."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper reports utility degradation in one setting (Mind2Web with Qwen3-32B, 0.840→0.810) and shows in Figure 3a that individual thinking interventions alone are less effective than their combination. The IIA faithfulness analysis reveals 10.9% unfaithful predictions."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims of (1) 'no utility degradation in all but one setting' and (2) 'reducing attack success rates from 100% to 8.5%' are directly supported by Tables 1-2. The Mind2Web Qwen3-32B PAIR row confirms ASR drops from 100.0% to 8.5%, and utility matches vanilla in all but one cell."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Causal claims ('IntentGuard achieves...', 'reducing ASR') are supported by controlled comparisons holding model and benchmark constant while varying the defense method. Ablation studies in Figure 3 isolate the contribution of individual components through controlled single-variable manipulation."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The abstract explicitly bounds results: 'on two agentic benchmarks (AgentDojo and Mind2Web) using two reasoning-enabled LLMs (Qwen-3-32B and gpt-oss-20B).' The title describes the method without claiming universal applicability."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper does not discuss alternative explanations for observed results. Section 5 discusses future directions but does not consider confounds such as whether the results are specific to reasoning-enabled models, whether the thinking intervention simply adds noise that disrupts attacks, or whether benchmark-specific artifacts drive the results."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper measures ASR (attack success rate) and Utility directly on the benchmarks. These metrics directly match the claims being made about defense effectiveness and task performance. No proxy gap exists between measurement and framing."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Specific models are identified: 'Qwen3-32B (Yang et al., 2025)' and 'gpt-oss-20B (Agarwal et al., 2025)' with references to their respective technical reports/model cards. These are specific model identifiers with family, version, and size."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Full prompt text is provided in Appendix A for all three thinking intervention strategies (start-of-thinking prefilling, end-of-thinking intervention, adversarial in-context demonstration). The PAIR attack prompt is also provided in full in Appendix B."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "Origin Tracing hyperparameters are reported (sliding window size 1/2, stride 1/8, alert threshold 0.7, token set ratio metric). However, LLM inference hyperparameters (temperature, top-p, max tokens, sampling strategy) are not stated for either model."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "IntentGuard's three-step pipeline is described in detail: (1) Intent Extraction via thinking intervention (Section 3.4, Figure 2), (2) Origin Tracing via sliding-window embedding matching (Section 3.3), (3) Injection Mitigation with alert and recovery modes. The full workflow is illustrated in Figure 1."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No documentation of how the benchmark data was preprocessed before evaluation. The paper describes the benchmarks at a high level (Section 4.1) but does not detail how inputs were formatted, how injections were integrated, or other preprocessing steps."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Section 5 ('Discussion') covers future research directions (training to enhance IntentGuard, alternative IIA designs, broader IIA applications) and is framed as future work rather than as a substantive limitations section. There is no dedicated limitations or threats-to-validity section."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No specific threats to validity are discussed. Section 5 identifies opportunities for improvement but does not address threats such as generalization beyond reasoning-enabled models, sensitivity to prompt formatting, or potential overfitting of the evaluation to specific attack implementations."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "While Section 3.1 states the focus on indirect prompt injection and footnote 1 acknowledges a limitation regarding legitimate instruction-following tasks, the paper does not systematically state what the results do NOT show (e.g., applicability to non-reasoning models, real-world deployment scenarios, other attack types)."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw experimental data (model outputs, per-example results, attack logs) is released for independent verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Data sources are described: 'AgentDojo (Debenedetti et al., 2024) which provides 97 realistic multi-step tasks across four simulated environments and 629 adversarial test cases' and 'Mind2Web (Deng et al., 2023) with AdvAgent (Xu et al., 2024), a web-agent benchmark which covers 440 tasks across 4 different domains' (Section 4.1)."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data sources are standard benchmarks (AgentDojo, Mind2Web)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The evaluation pipeline from benchmark input to reported metrics is not documented in detail. There are no descriptions of how many examples were filtered, how edge cases were handled, or how the attack optimization pipeline produced final adversarial inputs."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No formal funding acknowledgments section. The only disclosure is a footnote: 'This work was done during an internship at NVIDIA.' No grants or specific funding sources are listed."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: NVIDIA (Kang, Xiang, Kariyappa, Xiao, Suh), University of Illinois Urbana-Champaign (Kang, Li), Johns Hopkins University (Xiao). The NVIDIA internship is explicitly noted."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "NVIDIA, where the work was conducted, has commercial interests in LLM safety and security for its AI products and platforms. A strong defense framework could benefit NVIDIA's product offerings. The funder is not independent of the outcome."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is included in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "This paper tests a defense framework (IntentGuard) rather than evaluating a pre-trained model's knowledge or capability on benchmarks. The contamination concern is not central to the claims."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "The paper tests defenses against prompt injection rather than evaluating model knowledge. Train/test overlap is not relevant to the defense effectiveness claims."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "The paper evaluates a defense mechanism, not a model's inherent capabilities. Benchmark contamination does not affect the validity of the defense evaluation."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "Table 3 reports Origin Tracing time (0.075-0.352s), and Remark 3 states computational overhead is 'minimal.' However, no overall inference cost, API costs, tokens consumed, or wall-clock time per evaluation is reported."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No total computational budget is stated. The paper does not report GPU hours, total API spend, hardware used, or total evaluation time."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is not stated. It is unclear whether results are from single runs or averaged across multiple runs."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No hyperparameter search budget is reported. The default parameters (window ratio 0.5, threshold 0.7) appear chosen without documented search. Table 4 shows robustness to parameter choices but is not a search budget disclosure."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The default configuration (union of start/end instruction sets, adversarial reasoning demonstration, sparse embeddings, threshold 0.7) is stated but the selection rationale is not explicit. The ablation study justifies the thinking intervention combination but not the full parameter selection process."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper makes many comparisons across 4 attacks × 4 methods × 2 models × 2 benchmarks without any statistical testing, let alone correction for multiple comparisons."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors evaluate their own IntentGuard system against baselines and design their own adaptive attacks. No acknowledgment of self-comparison bias or independent evaluation is provided."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "IntentGuard adds inference overhead (thinking intervention tokens, Origin Tracing) compared to baselines, but performance is not reported as a function of compute budget. Table 3 compares tracing time for sparse vs. dense embeddings but does not provide compute-matched comparisons against baselines."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No discussion of whether AgentDojo and Mind2Web adequately measure real-world prompt injection defense effectiveness. The paper does not address whether synthetic multi-tool and web-agent benchmarks are valid proxies for actual deployment scenarios."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "All defense methods are compared using the same backbone models (Qwen3-32B, gpt-oss-20B) and the same benchmark evaluation frameworks. The external LLM detector in PromptArmor uses the same backbone model as IntentGuard for fair comparison (Section 4.1)."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of whether AgentDojo or Mind2Web tasks were available before the training cutoffs of Qwen3-32B or gpt-oss-20B, which could affect utility scores."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the evaluation setup could leak information to the models (e.g., through benchmark-specific patterns in the evaluation framework)."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of independence between benchmark examples or whether structural similarities among test cases could inflate results."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention method is used or discussed."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "IntentGuard preserves benign utility with zero false positive alerts in all but one experiment setting.",
    372       "evidence": "Tables 1 and 2 show FPR=0.000 for IntentGuard in all settings. Utility matches vanilla in all cases except Mind2Web with Qwen3-32B (0.840→0.810).",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "IntentGuard reduces PAIR attack success rate from 100% to 8.5% on Mind2Web with Qwen3-32B.",
    377       "evidence": "Table 2, Mind2Web, Qwen3-32B row: Vanilla ASR=1.000, IntentGuard ASR=0.085 under PAIR attack.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "IntentGuard substantially outperforms PromptArmor, achieving over 20% higher utility and more than 50% lower ASR under PAIR attacks on AgentDojo.",
    382       "evidence": "Table 1: Under PAIR attacks, IntentGuard Utility=0.687 vs PromptArmor=0.432 (25.5pp higher), IntentGuard ASR=0.092 vs PromptArmor=0.659 (56.7pp lower) for Qwen3-32B.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Combining start-of-thinking prefilling and end-of-thinking refinement yields the strongest robustness, driving ASR below 0.1.",
    387       "evidence": "Figure 3a shows individual interventions reduce ASR to ~0.3 and ~0.2 respectively, while their combination achieves ASR below 0.1.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "The IIA is faithful for 83.8% of instructions (strictly along the confusion matrix diagonal).",
    392       "evidence": "Figure 4 shows the confusion matrix with 64.5% (No Intent, Not Followed) + 19.3% (Intent, Followed) = 83.8% on the diagonal, with only 10.9% truly unfaithful (No Intent, Followed).",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Sparse embeddings match dense embeddings in robustness while being far more efficient for Origin Tracing.",
    397       "evidence": "Table 3 shows comparable ASR (0.092 vs 0.094 for Qwen3-32B) with sparse tracing time of 0.075s vs dense 0.274s.",
    398       "supported": "strong"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "No error bars or variance across runs",
    404       "detail": "All results in Tables 1-2 are point estimates with no uncertainty quantification. Given that LLM outputs are stochastic, single-run results may not be stable. The large number of comparisons (4 attacks × 4 methods × 2 models × 2 benchmarks) makes random variation a real concern."
    405     },
    406     {
    407       "flag": "Self-designed adaptive attacks",
    408       "detail": "The PAIR-adaptive attack and the adaptive optimization targets for Beam Search and GCG are designed by the same authors who built IntentGuard. This creates potential for unconscious bias in attack design — the authors may not fully stress-test their own defense's weaknesses. The concurrent work by Nasr et al. (2025) is referenced but their attacks are not directly evaluated."
    409     },
    410     {
    411       "flag": "NVIDIA conflict of interest not formally disclosed",
    412       "detail": "Five of six authors are affiliated with NVIDIA, and the work was done during an NVIDIA internship. NVIDIA has commercial interests in LLM safety for its AI platforms. No formal conflict of interest statement or funding acknowledgment is provided."
    413     },
    414     {
    415       "flag": "Limited model diversity",
    416       "detail": "Only two reasoning-enabled models (Qwen3-32B, gpt-oss-20B) are tested. The framework explicitly requires reasoning-enabled LLMs that generate thinking traces, but no evidence is provided for models without extended reasoning capabilities (e.g., GPT-4o, Claude, Llama)."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    422       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    423       "year": 2023,
    424       "relevance": "Foundational paper defining indirect prompt injection attacks on LLM-integrated applications."
    425     },
    426     {
    427       "title": "AgentDojo: A dynamic environment to evaluate attacks and defenses for LLM agents",
    428       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    429       "year": 2024,
    430       "relevance": "Multi-tool agentic benchmark used for evaluating prompt injection attacks and defenses."
    431     },
    432     {
    433       "title": "Mind2Web: Towards a generalist agent for the web",
    434       "authors": ["Xiang Deng", "Yu Gu", "Boyuan Zheng", "Shijie Chen", "Sam Stevens", "Boshi Wang", "Huan Sun", "Yu Su"],
    435       "year": 2023,
    436       "relevance": "Web-agent benchmark used with AdvAgent for evaluating defense against prompt injection in web interaction."
    437     },
    438     {
    439       "title": "SecAlign: Defending against prompt injection with preference optimization",
    440       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"],
    441       "year": 2024,
    442       "arxiv_id": "2410.05451",
    443       "relevance": "Finetuning-based defense against prompt injection using preference optimization, compared as alternative defense approach."
    444     },
    445     {
    446       "title": "PromptArmor: Simple yet effective prompt injection defenses",
    447       "authors": ["Tianneng Shi", "Kaijie Zhu", "Zhun Wang", "Yuqi Jia", "Will Cai"],
    448       "year": 2025,
    449       "arxiv_id": "2507.15219",
    450       "relevance": "LLM-based prompt injection detector used as a primary baseline defense in the evaluation."
    451     },
    452     {
    453       "title": "Attention Tracker: Detecting prompt injection attacks in LLMs",
    454       "authors": ["Kuo-Han Hung", "Ching-Yun Ko", "Ambrish Rawat", "I Chung", "Winston H Hsu", "Pin-Yu Chen"],
    455       "year": 2024,
    456       "arxiv_id": "2411.00348",
    457       "relevance": "Defense approach using attention-based internal signals to detect prompt injection, representing the detector-based defense category."
    458     },
    459     {
    460       "title": "Defeating prompt injections by design",
    461       "authors": ["Edoardo Debenedetti", "Ilia Shumailov", "Tianqi Fan", "Jamie Hayes", "Nicholas Carlini"],
    462       "year": 2025,
    463       "arxiv_id": "2503.18813",
    464       "relevance": "System-level defense approach enforcing rules in agentic systems to mitigate prompt injection."
    465     },
    466     {
    467       "title": "Jailbreaking black box large language models in twenty queries",
    468       "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J Pappas", "Eric Wong"],
    469       "year": 2025,
    470       "relevance": "PAIR attack method used as the strongest adaptive attack baseline in the evaluation."
    471     },
    472     {
    473       "title": "Universal and transferable adversarial attacks on aligned language models",
    474       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J Zico Kolter", "Matt Fredrikson"],
    475       "year": 2023,
    476       "arxiv_id": "2307.15043",
    477       "relevance": "GCG attack method used as a white-box adversarial attack baseline."
    478     },
    479     {
    480       "title": "The attacker moves second: Stronger adaptive attacks bypass defenses against LLM jailbreaks and prompt injections",
    481       "authors": ["Milad Nasr", "Nicholas Carlini", "Chawin Sitawarin"],
    482       "year": 2025,
    483       "arxiv_id": "2510.09023",
    484       "relevance": "Concurrent work proposing stronger adaptive attacks; shares similarities with IntentGuard's adaptive attack experiments."
    485     },
    486     {
    487       "title": "RL is a hammer and LLMs are nails: A simple reinforcement learning recipe for strong prompt injection",
    488       "authors": ["Yuxin Wen", "Arman Zharmagambetov", "Ivan Evtimov", "Narine Kokhlikyan", "Tom Goldstein", "Kamalika Chaudhuri", "Chuan Guo"],
    489       "year": 2025,
    490       "arxiv_id": "2510.04885",
    491       "relevance": "RL-based adaptive prompt injection attack representing a stronger attack methodology."
    492     },
    493     {
    494       "title": "AdvWeb: Controllable black-box attacks on VLM-powered web agents",
    495       "authors": ["Chejian Xu", "Mintong Kang", "Jiawei Zhang", "Zeyi Liao"],
    496       "year": 2024,
    497       "arxiv_id": "2410.17401",
    498       "relevance": "Black-box attack method targeting web agents, used with Mind2Web for evaluation."
    499     },
    500     {
    501       "title": "Meta SecAlign: A secure foundation LLM against prompt injection attacks",
    502       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "David Wagner", "Chuan Guo"],
    503       "year": 2025,
    504       "arxiv_id": "2507.02735",
    505       "relevance": "Finetuning-based defense training a foundation model to resist prompt injection."
    506     },
    507     {
    508       "title": "DataSentinel: A game-theoretic detection of prompt injection attacks",
    509       "authors": ["Yupei Liu", "Yuqi Jia", "Jinyuan Jia", "Dawn Song", "Neil Zhenqiang Gong"],
    510       "year": 2025,
    511       "relevance": "Known-answer detection approach for prompt injection using game-theoretic formulation."
    512     },
    513     {
    514       "title": "Effectively controlling reasoning models through thinking intervention",
    515       "authors": ["Tong Wu", "Chong Xiang", "Jiachen T Wang", "G Edward Suh", "Prateek Mittal"],
    516       "year": 2025,
    517       "arxiv_id": "2503.24370",
    518       "relevance": "Introduces the thinking intervention concept that IntentGuard adapts for instruction-following intent elicitation."
    519     }
    520   ],
    521   "engagement_factors": {
    522     "practical_relevance": {
    523       "score": 2,
    524       "justification": "Proposes a concrete defense framework that practitioners building LLM agents could adopt, but requires reasoning-enabled models and custom implementation with no released code."
    525     },
    526     "surprise_contrarian": {
    527       "score": 1,
    528       "justification": "Novel perspective of analyzing instruction-following intent rather than detecting malicious text, but the overall finding that defenses can work is not surprising."
    529     },
    530     "fear_safety": {
    531       "score": 2,
    532       "justification": "Directly addresses prompt injection security threats to LLM agents, showing that attacks can achieve a 100% success rate without defenses."
    533     },
    534     "drama_conflict": {
    535       "score": 0,
    536       "justification": "No controversy or conflict; straightforward defense proposal and evaluation."
    537     },
    538     "demo_ability": {
    539       "score": 0,
    540       "justification": "No code, demo, or tool is released. The approach cannot be tried without reimplementation."
    541     },
    542     "brand_recognition": {
    543       "score": 2,
    544       "justification": "NVIDIA is a high-profile AI lab; the paper also evaluates models from well-known organizations."
    545     }
    546   }
    547 }

Impressum · Datenschutz