ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32819B)


      1 {
      2   "paper": {
      3     "title": "SecInfer: Preventing Prompt Injection via Inference-time Scaling",
      4     "authors": [
      5       "Yupei Liu",
      6       "Yanting Wang",
      7       "Yuqi Jia",
      8       "Jinyuan Jia",
      9       "Neil Zhenqiang Gong"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2509.24967",
     14     "doi": "10.48550/arXiv.2509.24967"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "SecInfer defends against prompt injection by sampling multiple LLM responses via diverse chain-of-thought system prompts and selecting the response most aligned with the target task using semantic clustering and an LLM-as-a-judge. Across 4 LLMs, 6 target tasks, 7 existing attacks, and 6 adaptive attacks, SecInfer reduces attack success rate to near zero while preserving task utility comparable to undefended baselines. The method outperforms both existing prevention-based defenses and prior inference-time scaling approaches, though it is less effective when the injected and target tasks are of the same type.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No code repository URL, GitHub link, or archive is provided anywhere in the paper. The paper references open-source implementations of attacks (e.g., nano-GCG) but does not release its own SecInfer implementation."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "All datasets used are publicly available standard benchmarks: AG News, MMLU, GPQA, NarrativeQA, SQuAD_v2, SimpleQA, Open-Prompt-Injection, InjecAgent, and AgentDojo. The paper did not collect proprietary data."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions using NVIDIA GH200 GPUs and specific LLMs but provides no requirements.txt, Dockerfile, or detailed environment specification listing library versions sufficient to recreate the experimental environment."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The algorithms are described at the pseudocode level, but a researcher would need to implement the full pipeline from scratch."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results in Tables 1-8 are reported as point estimates only. No confidence intervals, error bars, or ± notation appear anywhere in the paper."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper claims SecInfer 'outperforms' baselines and achieves 'higher effectiveness' throughout, but no statistical significance tests (t-tests, bootstrap, etc.) are reported. All comparisons are based on raw numerical differences."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Results are reported with baseline context enabling effect size assessment. For example, Table 1 shows ASR drops from 0.64 to 0.00 under Combined Attack on AG News, and utility preservation is shown as 0.87 vs 0.86 for GPT-4.1. Absolute differences are computable from all tables."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Section 5.1 states 100 contaminated samples per target-injected task pair but provides no justification for why 100 is sufficient and no power analysis."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No standard deviations, interquartile ranges, or variance measures are reported across any results. It is unclear whether results are from single runs or averaged over multiple runs, and no spread measure accompanies any reported number."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Table 2 compares against 8 prevention-based defenses (Delimiters, Sandwich, Instructional, Paraphrasing, Retokenization, DS+PL, PromptArmor, MetaSecAlign). Table 3 compares against 6 inference-time scaling methods (ICL-2, ICL-4, CoT, IR, BoN, SC)."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Baselines include very recent methods: MetaSecAlign (Chen et al., 2025c), PromptArmor (Shi et al., 2025c), DataSentinel+PromptLocate (Liu et al., 2025; Jia et al., 2026). These represent the state of the art at time of writing."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Section 5.4 presents detailed ablations: Step I sampling variants (Figure 3), impact of N (Figure 4), Step II aggregation variants (Figure 5), and choice of judge LLM (Table 7). Each ablation isolates specific components."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Three metrics are used throughout: U (utility under no attack), UA (utility under attack), and ASR (attack success rate). These capture different aspects of defense effectiveness."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "All evaluation is automated. No human evaluation of system outputs is performed. For open-domain tasks, automated metrics (ROUGE-1, Pass@1) are used exclusively."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Standard benchmark test splits are used. Appendix E notes that ICL demonstration examples are guaranteed not to be in the evaluation set. SecInfer's system prompts are fixed (not tuned on test data)."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by target task (6 datasets), attack type (7 existing + 6 adaptive), and backend LLM (4 models) in Tables 1, 2, 3, and 8. InjecAgent results are broken down by attack variant and harm type (Table 5)."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 6 provides a detailed discussion of when SecInfer fails: same-type target/injected tasks with concrete examples (sentiment analysis). Table 12 quantifies the failure (ASR 0.90→0.79 for SST2). Adaptive Attack V is shown to degrade UA substantially."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Table 12 shows SecInfer achieves only marginal ASR reduction for same-type tasks (0.90→0.79). Table 8 shows adaptive attacks V and VI reduce UA significantly. Section 6 openly discusses these limitations."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims SecInfer 'effectively mitigates both existing and adaptive prompt injection attacks, outperforming state-of-the-art defenses as well as existing inference-time scaling approaches.' Tables 1-3 and 8 support all three claims with near-zero ASR across conditions and superior UA compared to baselines."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Causal claims about component contributions are supported by controlled ablation studies in Section 5.4: each Step I variant and Step II variant is tested in isolation (Figures 3, 5). The ablation design uses single-variable manipulation, adequate for the causal claims made."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper tests across 4 LLMs, 6 target tasks, 8 injected tasks, 13 attacks, and 2 agent benchmarks, which is extensive. Section 6 explicitly bounds generalization by identifying same-type task limitations. Claims are generally well-matched to the breadth of testing."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Section 6 conducts a memorization test to rule out the alternative explanation that the judge LLM memorized benchmark answers rather than genuinely evaluating responses. Over 94% of samples had similarity ratios below 0.6, ruling out this confound."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper's claims match the granularity of its measurements. ASR directly measures whether the model produces the attacker-desired response. U and UA directly measure task performance using standard metrics (accuracy, ROUGE-1, Pass@1). No proxy gap exists between measurements and framing."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Open-weight models are specified precisely (LLaMA3.1-8B-Instruct, Qwen3-8B). However, closed-source models are identified only by marketing names: 'GPT-4o' and 'GPT-4.1' without snapshot dates. Appendix C mentions Azure API version '2024-12-01-preview' but this is the API version, not the model snapshot."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Appendix A provides all 5 system prompts verbatim. Figure 7 shows the full judge instruction. Appendix G provides agent-specific system prompts. Appendix H provides adaptive attack prompts. The actual prompt text is reproducible."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Appendix C specifies temperature, max output tokens, and top-k for each model: LLaMA (temp=0.1, max=150, top-k=20), Qwen3 (temp=0.6, max=32768, top-k=20), GPT-4o/4.1 (temp=0.7, max=500, top-k=20)."
    159       },
    160       "scaffolding_described": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "SecInfer's two-step pipeline is described in full algorithmic detail (Algorithms 1-3), including system-prompt-guided sampling with CoT prompts and target-task-guided aggregation using semantic clustering (Agglomerative Clustering, all-MiniLM-L6-v2 embeddings) and LLM-as-a-judge."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 5.1 describes how contaminated data is constructed for each attack type. Appendix D details attack implementations. Appendix B describes the mapping function for closed-domain tasks. Dataset sources and how injected prompts are integrated are clearly specified."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 6 'Discussion and Limitations' is a dedicated section with substantive discussion of two specific issues: the same-type task failure and a memorization test of the judge LLM."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 6 discusses specific threats: (1) SecInfer fails when injected and target tasks are the same type, with quantitative evidence from Table 12; (2) the memorization concern is tested with string similarity analysis showing >94% of samples below 0.6 similarity. These are specific to this study."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 6 explicitly states what SecInfer does NOT handle: same-type target/injected tasks, where 'prompt injection in this scenario reduces to traditional adversarial examples.' Section 5.5 notes Adaptive Attack V defeats utility preservation when the entire data is modified. Footnote 1 in Section 4.1 also flags the same-type limitation."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No raw experimental data (individual model outputs, per-sample results, intermediate clustering outputs) is released. Only aggregated metrics in tables are provided."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 5.1 describes all datasets used, their sources, and how contaminated samples are generated (100 per target-injected task pair). Attack construction details are in Section 2.2 and Appendix D."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants in this study. All experiments use automated evaluation on standard benchmarks."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The pipeline from data preparation (selecting target/injected tasks, constructing contaminated samples) through SecInfer's two steps (sampling N responses, clustering, judge selection) to metric computation is documented in Sections 4-5 and Appendices B-F."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding acknowledgment or grant information appears anywhere in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: Penn State University (Liu, Wang, Jia J.) and Duke University (Jia Y., Gong). Authors are academic researchers not affiliated with the companies whose models they evaluate."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Since funding is not disclosed, independence cannot be assessed. The absence of funding disclosure prevents evaluation of this criterion."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement or financial interest disclosure appears in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No training data cutoff dates are stated for any of the four LLMs used (LLaMA3.1-8B-Instruct, Qwen3-8B, GPT-4o, GPT-4.1). This is relevant because utility metrics (U) are measured on standard benchmarks that may be in training data."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No discussion of potential overlap between LLM training data and the benchmark test sets (AG News, MMLU, GPQA, etc.) used for utility evaluation. The memorization test in Section 6 only addresses the judge LLM, not the backend LLMs."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "Several benchmarks used (AG News from 2020, MMLU from 2021, SQuAD_v2 from 2018) predate all models' training periods and are widely available online, creating high contamination risk for utility measurements. This is not discussed."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Table 4 reports average inference time per sample for all methods. SecInfer takes 0.612 seconds. The paper also notes parallelization across 5 GH200 GPUs reduces this to 0.221s (Section 5.2)."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Per-sample inference time is reported (Table 4) but the total computational budget for all experiments (total GPU hours, total API spend for GPT-4o/4.1 experiments) is not stated."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No random seed sensitivity analysis is reported. The paper does not mention running experiments with multiple seeds or analyzing seed-dependent variability in results."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The number of contaminated samples (100 per target-injected task pair) is stated, but the number of independent experimental runs per configuration is not specified. It is unclear if results are from single runs."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "Default hyperparameters (N=5, specific temperatures, top-k values) are used without describing how they were selected. The ablation study explores alternatives post-hoc but no systematic search budget is reported."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "Section 5.4 provides ablation studies justifying the default configuration: Figure 3 validates the combined sampling strategy, Figure 4 shows N=5 is sufficient, Figure 5 validates LLM-as-a-judge, and Table 7 evaluates judge LLM choices. These are performed on SQuAD_v2/SST2, separate from the main evaluation."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No statistical significance tests are performed at all, so multiple comparison correction is not applicable."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors implement baselines following existing works (Appendix E) but do not acknowledge or discuss the potential bias of implementing and evaluating their own system against their own baseline implementations."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": true,
    332         "justification": "Table 4 reports inference time for all methods, enabling compute-matched comparisons. Figure 4 explicitly shows performance (U, UA, ASR) as a function of N alongside computation time. SecInfer's overhead (0.612s) is compared to BoN (0.719s) and SC (0.622s)."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper does not discuss whether the benchmark tasks (AG News classification, MMLU, etc.) are valid proxies for real-world prompt injection scenarios like AI Overviews or Review Highlights, despite motivating the work with these applications."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "When comparing backend LLMs in Table 1, all use the same SecInfer method. When comparing defenses in Tables 2-3, the same backend LLM (LLaMA3.1-8B-Instruct) is used throughout. The experimental design controls for scaffold confounds."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "No discussion of temporal leakage. Benchmark datasets (AG News 2020, MMLU 2021, SQuAD_v2 2018) were created years before the models' training periods, meaning models may have seen solutions during training."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of feature leakage in the evaluation setup."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of potential non-independence between training and evaluation data for the LLMs used."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": true,
    364         "justification": "Section 6 applies a memorization test following Staab et al. (2024) methodology on the judge LLM, measuring string similarity between outputs and ground-truth answers. Over 94% of samples had similarity ratios below 0.6, providing concrete leakage detection evidence for the judge LLM."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "SecInfer reduces ASR to nearly zero across all tested attacks, LLMs, and datasets.",
    371       "evidence": "Table 1(b) shows ASR of 0.00 across nearly all combinations of 4 LLMs, 6 target tasks, and 7 attacks (with minor exceptions of 0.01-0.02 for GCG attack on some tasks).",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "SecInfer preserves task utility (U) comparable to the undefended baseline.",
    376       "evidence": "Table 1 shows U with SecInfer is within 1-2 percentage points of no-defense U for most LLM-task combinations (e.g., 0.87 vs 0.86 for GPT-4.1 on AG News).",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "SecInfer outperforms all existing prevention-based defenses.",
    381       "evidence": "Table 2 shows SecInfer achieves 0.00 ASR under Combined Attack while most baselines have ASR 0.37-0.86. MetaSecAlign matches on CA but fails under GCG (ASR 0.25-0.63). DS+PL also shows non-negligible ASR under GCG for some tasks.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "SecInfer outperforms prior inference-time scaling methods against prompt injection.",
    386       "evidence": "Table 3 shows all prior methods (ICL, CoT, IR, BoN, SC) have substantially higher ASR (0.06-0.45) and lower UA compared to SecInfer (0.00 ASR) under Combined Attack.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "SecInfer is robust against six types of adaptive attacks specifically targeting its components.",
    391       "evidence": "Table 8(b) shows ASR remains near zero (0.00-0.06) across all six adaptive attacks when SecInfer is deployed, compared to high ASR without defense (Table 8(a)). However, UA degrades notably under Adaptive Attacks V and VI.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "SecInfer is effective in LLM agent settings (InjecAgent and AgentDojo).",
    396       "evidence": "Table 5 shows ASR drops from 0.37-0.68 to 0.00-0.02 on InjecAgent. Table 6 shows ASR drops from 0.21 to 0.02 on AgentDojo while utility is maintained (0.40→0.39 U).",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "SecInfer is less effective when the injected and target tasks are of the same type.",
    401       "evidence": "Table 12 shows ASR only drops from 0.90 to 0.79 for SST2 and from 0.38 to 0.28 for Gigaword when target and injected tasks match. Section 6 explains this reduces to traditional adversarial examples.",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "The judge LLM's effectiveness does not stem from memorization of target task datasets.",
    406       "evidence": "Section 6 memorization test shows over 94% of samples have string similarity ratios below 0.6 between judge outputs and ground-truth answers, following Staab et al. (2024) methodology.",
    407       "supported": "moderate"
    408     }
    409   ],
    410   "red_flags": [
    411     {
    412       "flag": "No uncertainty quantification",
    413       "detail": "All results across all tables (1-8, 12) are reported as single point estimates with no error bars, confidence intervals, standard deviations, or variance measures. Given that SecInfer involves stochastic sampling (temperature sampling, random system prompt selection), results could vary meaningfully across runs, making it impossible to assess the reliability of reported differences."
    414     },
    415     {
    416       "flag": "No code release",
    417       "detail": "Despite the algorithmic complexity of SecInfer (two-step pipeline with semantic clustering, multiple system prompts, judge LLM integration), no source code is released, significantly limiting reproducibility and independent verification of results."
    418     },
    419     {
    420       "flag": "Benchmark contamination risk for utility metrics",
    421       "detail": "Several benchmarks used for utility evaluation (AG News 2020, MMLU 2021, SQuAD_v2 2018) predate all model training periods. High U scores on these benchmarks may partly reflect training data memorization rather than genuine task competence. The memorization test in Section 6 only addresses the judge LLM, not the backend LLMs."
    422     },
    423     {
    424       "flag": "Construct validity gap between evaluation and motivation",
    425       "detail": "The paper motivates SecInfer with real-world applications (AI Overviews, Review Highlights) but evaluates exclusively on academic benchmarks (AG News classification, MMLU multiple-choice). No evaluation is conducted on the motivating use cases, leaving a gap between claimed applicability and demonstrated effectiveness."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    431       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    432       "year": 2024,
    433       "relevance": "Foundational benchmark (Open-Prompt-Injection) for evaluating prompt injection attacks and defenses, directly used in this paper's experimental setup."
    434     },
    435     {
    436       "title": "Meta secalign: A secure foundation llm against prompt injection attacks",
    437       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "David Wagner", "Chuan Guo"],
    438       "year": 2025,
    439       "relevance": "State-of-the-art LLM fine-tuning defense against prompt injection, used as a key baseline showing SecInfer's advantage against optimization-based attacks."
    440     },
    441     {
    442       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    443       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    444       "year": 2023,
    445       "arxiv_id": "2302.12173",
    446       "relevance": "Early formalization of indirect prompt injection attacks against LLM-integrated applications, establishing the threat model this paper defends against."
    447     },
    448     {
    449       "title": "Self-consistency improves chain of thought reasoning in language models",
    450       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans", "Quoc Le", "Ed Chi"],
    451       "year": 2023,
    452       "relevance": "Key inference-time scaling baseline (self-consistency via majority voting) that SecInfer extends and outperforms for prompt injection defense."
    453     },
    454     {
    455       "title": "Universal and transferable adversarial attacks on aligned language models",
    456       "authors": ["Andy Zou", "Zifan Wang", "J. Zico Kolter", "Matt Fredrikson"],
    457       "year": 2023,
    458       "arxiv_id": "2307.15043",
    459       "relevance": "GCG optimization attack used in this paper's evaluation as an optimization-based prompt injection method."
    460     },
    461     {
    462       "title": "Trading inference-time compute for adversarial robustness",
    463       "authors": ["Wojciech Zaremba", "Evgenia Nitishinskaya", "Boaz Barak"],
    464       "year": 2025,
    465       "arxiv_id": "2501.18841",
    466       "relevance": "Concurrent work exploring inference-time scaling for adversarial robustness including prompt injection; provides the COMPROMISE injected task used in this paper."
    467     },
    468     {
    469       "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents",
    470       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    471       "year": 2024,
    472       "relevance": "Agent prompt injection benchmark used to evaluate SecInfer in LLM agent settings, demonstrating ASR reduction from 0.37-0.68 to 0.00-0.02."
    473     },
    474     {
    475       "title": "Agentdojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents",
    476       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic"],
    477       "year": 2024,
    478       "relevance": "Agent prompt injection benchmark used to evaluate SecInfer, showing ASR reduction from 0.21 to 0.02 while preserving agent utility."
    479     },
    480     {
    481       "title": "DataSentinel: A game-theoretic detection of prompt injection attacks",
    482       "authors": ["Yupei Liu", "Yuqi Jia", "Jinyuan Jia", "Dawn Song", "Neil Zhenqiang Gong"],
    483       "year": 2025,
    484       "relevance": "State-of-the-art prompt injection detection method used as baseline (DS+PL); shows complementary detection-based approach to SecInfer's prevention-based defense."
    485     },
    486     {
    487       "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks",
    488       "authors": ["Dario Pasquini", "Martin Strohmeier", "Carmela Troncoso"],
    489       "year": 2024,
    490       "arxiv_id": "2403.03792",
    491       "relevance": "Optimization-based prompt injection attack (NeuralExec) used as a strong attack baseline in the evaluation."
    492     },
    493     {
    494       "title": "A critical evaluation of defenses against prompt injection attacks",
    495       "authors": ["Yuqi Jia", "Zedian Shao", "Yupei Liu", "Jinyuan Jia", "Dawn Song", "Neil Zhenqiang Gong"],
    496       "year": 2025,
    497       "arxiv_id": "2505.18333",
    498       "relevance": "Comprehensive evaluation of prompt injection defenses showing limitations of fine-tuning approaches against optimization-based attacks, motivating SecInfer's inference-time approach."
    499     },
    500     {
    501       "title": "Optimization-based prompt injection attack to LLM-as-a-judge",
    502       "authors": ["Jiawen Shi", "Zenghui Yuan", "Yinuo Liu"],
    503       "year": 2024,
    504       "relevance": "JudgeDeceiver attack method used in Adaptive Attack IV to optimize injected prompts that mislead SecInfer's judge LLM component."
    505     },
    506     {
    507       "title": "PromptArmor: Simple yet effective prompt injection defenses",
    508       "authors": ["Tianneng Shi", "Kaijie Zhu", "Zhun Wang"],
    509       "year": 2025,
    510       "arxiv_id": "2507.15219",
    511       "relevance": "Reasoning-LLM-based prompt injection defense used as a baseline, showing limited effectiveness compared to SecInfer."
    512     }
    513   ],
    514   "engagement_factors": {
    515     "practical_relevance": {
    516       "score": 2,
    517       "justification": "SecInfer is a deployable defense for production LLM applications with concrete implementation details, though it requires additional inference compute (~5x overhead)."
    518     },
    519     "surprise_contrarian": {
    520       "score": 1,
    521       "justification": "Extends the existing inference-time scaling paradigm to security rather than challenging conventional wisdom; the finding that vanilla scaling methods fail for prompt injection is moderately surprising."
    522     },
    523     "fear_safety": {
    524       "score": 2,
    525       "justification": "Addresses prompt injection, ranked as the top LLM security threat by OWASP, and demonstrates that existing defenses including fine-tuning are defeated by optimization-based attacks."
    526     },
    527     "drama_conflict": {
    528       "score": 0,
    529       "justification": "Straightforward technical contribution with no controversy or conflict; presents a new defense rather than critiquing existing approaches or organizations."
    530     },
    531     "demo_ability": {
    532       "score": 0,
    533       "justification": "No code release, no demo, no pip-installable tool. Implementation would need to be done from scratch based on the paper's descriptions."
    534     },
    535     "brand_recognition": {
    536       "score": 1,
    537       "justification": "Authors are from Penn State and Duke, respected universities but not top AI labs; evaluates GPT-4o/GPT-4.1, which adds some brand recognition."
    538     }
    539   }
    540 }

Impressum · Datenschutz