scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (33310B)
      1 {
      2   "paper": {
      3     "title": "PIGuard: Prompt Injection Guardrail via Mitigating Overdefense for Free",
      4     "authors": [
      5       "Hao Li",
      6       "Xiaogeng Liu",
      7       "Ning Zhang",
      8       "Chaowei Xiao"
      9     ],
     10     "year": 2025,
     11     "venue": "ACL 2025 (Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics, Volume 1: Long Papers)",
     12     "doi": "10.18653/v1/2025.acl-long.1468"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "PIGuard, a DeBERTa-based prompt guard model trained with the Mitigating Over-defense for Free (MOF) strategy, achieves 83.48% average accuracy across benign, malicious, and over-defense detection, substantially outperforming the best open-source baseline ProtectAIv2 (64.01%) while maintaining comparable inference speed. The NotInject benchmark reveals that all existing open-source prompt guard models suffer severe over-defense, with accuracy below 60% (near random guessing at 50%). The MOF approach, which identifies biased tokens via single-token probing and generates debiasing training data, effectively mitigates over-defense without requiring any over-defense-specific dataset, and outperforms traditional shortcut mitigation methods that degrade malicious detection performance.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper states in the abstract: 'The code and datasets are released at https://github.com/leolee99/PIGuard.' A specific GitHub URL is provided."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The NotInject dataset and training data are released at the GitHub repository. Additionally, the training data is collected from 20 open-source datasets which are all individually cited and publicly available (Tables 4-6, Appendix A)."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper specifies the backbone (DeBERTaV3-base) and training hyperparameters (Sec 5.1) but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Training details are given in Section 5.1 (batch size, epochs, learning rate, optimizer) but the paper does not include step-by-step reproduction instructions. A GitHub repository is linked but its contents cannot be verified from the paper text."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables 1, 2, 3, 7, 9, and 10 report only point estimates (e.g., '83.48%') with no confidence intervals, error bars, or ± notation."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims PIGuard 'surpasses' and 'outperforms' baselines based solely on comparing raw accuracy numbers. No statistical significance tests (t-tests, bootstrap, etc.) are used for any comparative claims."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper consistently reports improvements with baseline context, e.g., 'PIGuard achieves an average accuracy of 83.48%... exceeding the top open-source model, ProtectAIv2, by 30.4%' (Sec 5.2). Note that the 30.4% is a relative improvement over ProtectAIv2's 64.01%; the absolute gap is 19.47pp. Table 1 provides all absolute numbers, enabling magnitude assessment."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "NotInject contains 339 samples (113 per difficulty level). The MOF generates 1,000 samples. Neither sample size is justified through power analysis or principled reasoning. The 1,000 MOF sample count is selected via ablation (Table 9) rather than a priori justification."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Results appear to be single-run with no mention of multiple experimental runs, random seeds, or any spread measure (std dev, IQR). No variance is reported anywhere in the paper."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper compares against 5 prompt guard models (Fmops, Deepset, PromptGuard, ProtectAIv2, LakeraGuard) and 3 LLM-based methods (GPT-4o, Llama-2-chat, LlamaGuard3), as shown in Table 1."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Baselines include current state-of-the-art models: ProtectAIv2 (2024), LakeraGuard (2024), GPT-4o (2024), and LlamaGuard3 (2024). These represent the most recent prompt guard solutions available."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Table 2 presents a comprehensive ablation study showing the individual and combined effects of basic dataset training, data-centric augmentation, and MOF (with finetuning vs. scratch retrain). Table 3 compares MOF against standard shortcut mitigation. Table 9 ablates MOF sampling scale."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper evaluates on three accuracy dimensions (over-defense, benign, malicious), plus GFLOPs, inference time, and efficiency (accuracy/time). Table 1 reports all six metrics."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "All evaluation of PIGuard and baselines is fully automated using accuracy on benchmark datasets. No human evaluation of the system's detection outputs was conducted. Human evaluators were only used for dataset construction (trigger word refinement), not system evaluation."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "NotInject is explicitly not used for training (MOF generates its own data independently). PINT and WildGuard are external benchmarks. BIPIA uses a train/test split (Table 5 lists 'BIPIA_train' separately from test evaluation in Table 7)."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 7 provides per-benchmark breakdowns for all models across NotInject subsets (one-word, two-word, three-word), WildGuard, PINT, and BIPIA. Table 8 shows topic category distribution within NotInject."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "The paper shows failure cases of OTHER models (Figures 2, 3, 7) but does not analyze where PIGuard itself fails. No error analysis or qualitative examples of PIGuard's failures are provided despite ~17% error rate."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Table 2 shows data-centric augmentation alone HURTS over-defense accuracy (75.22% → 64.31%). Table 3 shows standard shortcut mitigation degrades malicious accuracy by 10.42%. Table 9 shows diminishing returns at 2,000 MOF samples."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims of 'surpassing the existing best model by 30.4%' and '87.32% over-defense accuracy' are supported by Table 1 data, though the 30.4% figure represents relative improvement ((83.48-64.01)/64.01) rather than the more commonly expected absolute difference (19.47pp), which is slightly misleading but technically supported."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper's causal claims about MOF improving performance are supported by controlled ablation studies (Table 2) with single-variable manipulation: basic dataset → +augmentation → +MOF, with each component isolated. Table 3 further compares MOF against an alternative approach using the same base model."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The abstract claims PIGuard is 'a robust and open-source solution for detecting prompt injection attacks' without bounding to the tested benchmarks (PINT, BIPIA, WildGuard, NotInject). The limitations section acknowledges gaps in domain-specific applications but the title and abstract framing is broader than the tested settings."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper attributes PIGuard's success entirely to MOF and data-centric augmentation without discussing alternative explanations. For example, the specific choice of DeBERTaV3-base backbone, the training data distribution, or the particular mix of open-source datasets could contribute to the results, but these are not explored."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures accuracy on specific benchmarks and frames results in terms of these specific metrics (over-defense accuracy, benign accuracy, malicious accuracy). Claims match the granularity of measurements without inflating proxy metrics to broader outcomes."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "DeBERTaV3-base is specified. GPT-4o-mini is versioned ('2024-07-18 version'). However, GPT-4o is used as a key baseline without a snapshot date or API version. LlamaGuard3 lacks a specific version. Multiple models lack precise versioning."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Full prompt text is provided in Appendix D: Figure 10 (word-based generation), Figure 11 (LLM injection detection, used for both data refinement and LLM baseline evaluation), and Figure 12 (data augmentation). The paper states 'the same prompts are employed to facilitate malicious content detection by GPT-4o and Llama-2-chat.'"
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "PIGuard training hyperparameters are well-documented (Sec 5.1: batch size 32, 3 epochs, Adam optimizer, lr 2e-5, 100-step warmup, max token length 512). However, temperature and sampling settings for GPT-4o, GPT-4o-mini, and Llama-2-chat API calls are not stated, which affects reproducibility of both data generation and baseline evaluation."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "PIGuard is a text classification model (DeBERTa-based). No agentic scaffolding is used."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 3.2 documents the full NotInject construction pipeline (trigger word identification → refinement → corpus generation) with Algorithm 1. Section 4.1 details training data collection from 20 datasets with counts (Tables 4-6). Section 4.2 describes MOF data generation. Data pipeline stages and counts are documented."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "A dedicated 'Limitations' section is present after the Conclusions, discussing the NotInject dataset's potential shortcomings in capturing real-world diversity, particularly in domain-specific applications like healthcare and finance."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The limitations section mentions specific concerns: 'the NotInject dataset, while carefully designed, may not fully capture the diversity of real-world benign inputs, particularly in domain-specific applications. This could result in the underestimation of models' over-defense tendency in complex, sensitive fields such as healthcare or finance.'"
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The limitations discuss future work directions but do not explicitly state what the results do NOT show. No explicit boundaries on which languages, attack types, or deployment scenarios are excluded from the claims. The paper does not state specific things it did not test beyond the general mention of domain-specific gaps."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The NotInject dataset and code are released at https://github.com/leolee99/PIGuard. Training data is sourced from 20 publicly available open-source datasets, all individually cited with links in Appendix A."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 3.2 provides detailed data collection procedures for NotInject (trigger word identification via word frequency analysis, refinement via LLM + human evaluation, corpus generation). Section 4.1 describes training data collection from 20 sources with counts. Algorithm 1 formalizes the trigger word identification process."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "Three human evaluators with 'security expertise' were used for trigger word refinement (Sec 3.2), but their recruitment method, specific qualifications, and potential selection biases are not described. Only the evaluation protocol (5-point frequency scale, Figure 6) is provided."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The full data pipeline is documented: Section 3.2 shows the 3-step NotInject construction pipeline with Figure 4. Section 4.1 documents training data aggregation from 20 sources with counts at each stage (Tables 4-6). Section 4.2 describes the MOF pipeline (token recheck → generation → refinement → retraining). Figure 9 reports error ratios after generation."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding sources, grants, or sponsors are mentioned anywhere in the paper. There is no acknowledgments section disclosing funding."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly stated: Washington University in St. Louis and University of Wisconsin-Madison. These are academic institutions not directly affiliated with any evaluated product."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding is disclosed, so independence of funding cannot be assessed. The university affiliations suggest likely academic funding (NSF, etc.) which would be independent, but this is not stated."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement is present in the paper. There is no declaration of financial interests, patents, or equity."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "The paper evaluates a defense tool (PIGuard, a fine-tuned DeBERTa classifier) rather than testing a pre-trained model's knowledge on benchmarks. The LLM baselines (GPT-4o, etc.) are contextual comparisons, not the main contribution."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "This paper tests a defense tool rather than evaluating pre-trained model capability on benchmarks. Per schema guidelines, contamination items are NA for studies that test defenses/tools rather than model knowledge."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "This paper tests a defense tool rather than evaluating pre-trained model capability on benchmarks. Per schema guidelines, contamination items are NA for defense/tool evaluation studies."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human subjects study is conducted. Three human evaluators assist with dataset annotation (trigger word refinement) but this is not a human subjects study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human subjects study is conducted. The paper includes an Ethics Statement but no IRB approval is needed for dataset annotation work."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human subjects study. Human evaluators are described only as having 'security expertise.'"
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human subjects study is conducted."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human subjects study is conducted."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human subjects study is conducted."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human subjects study is conducted."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Table 1 reports GFLOPs (60.45) and inference time (15.34ms) for PIGuard and all baselines. An efficiency metric (accuracy/inference time) is also computed. Figure 1 visualizes the performance-efficiency tradeoff."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Training compute is not stated. The paper reports training hyperparameters (Sec 5.1) but does not quantify total GPU hours, training time, or API costs for data generation with GPT-4o-mini."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from a single training run."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of experimental runs is never stated. It is unclear whether results are from single or multiple runs."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Hyperparameters are stated (Sec 5.1) but no search budget, number of configurations tried, or search method is reported. Only the MOF sampling scale is ablated (Table 9 tests 3 values)."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "Table 9 transparently shows the selection of 1,000 MOF samples by comparing 500, 1,000, and 2,000 samples with the 1,000 configuration yielding the best average accuracy (83.48%). Table 2 shows all ablation configurations that led to the final design."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Multiple comparisons are made across 8+ models on 4+ benchmarks with no statistical tests at all, let alone multiple comparison corrections."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors train and evaluate PIGuard against baselines without acknowledging author-evaluation bias. While the baselines are existing models (not re-implemented), the evaluation setup and benchmark selection are controlled by the authors."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "Table 1 reports both performance (accuracy) and compute (GFLOPs, inference time) for all models. Figure 1 plots performance vs. time efficiency. The paper discusses how PIGuard achieves comparable accuracy to GPT-4o at 503x higher efficiency."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper uses PINT, BIPIA, WildGuard, and NotInject benchmarks without discussing whether these benchmarks capture real-world prompt injection detection needs or whether benchmark performance translates to deployment effectiveness."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved. All models are classifiers or directly prompted LLMs, with no scaffold-dependent comparisons."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "Training data includes datasets like BIPIA_train, TaskTracker, and jailbreak-classification that share origins with test benchmarks. No discussion of temporal relationships between training and test data creation."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup could inadvertently leak information. For example, the NotInject dataset is constructed using a trigger word identification methodology whose identified trigger words could overlap with PIGuard's MOF-identified tokens, and this possible overlap is not examined."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "Training uses BIPIA_train (Table 5) while testing on BIPIA (Table 7). Training uses TaskTracker and jailbreak-classification which share structural similarities with test sets. The independence of train and test examples is not explicitly verified or discussed."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No concrete leakage detection or prevention method is applied. No decontamination pipeline, overlap analysis, or membership checks between training and test data."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "PIGuard achieves 83.48% average accuracy across benign, malicious, and over-defense detection, surpassing the best open-source model (ProtectAIv2) by 30.4%.",
    369       "evidence": "Table 1 shows PIGuard at 83.48% average accuracy vs. ProtectAIv2 at 64.01%. The 30.4% is relative improvement ((83.48-64.01)/64.01 ≈ 30.4%), not absolute difference (19.47pp). Section 5.2.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Existing open-source prompt guard models suffer severe over-defense, with accuracy below 60% (near random guessing at 50%).",
    374       "evidence": "Table 1 shows over-defense accuracy: Fmops 28.32%, Deepset 29.50%, PromptGuard 0.29%, ProtectAIv2 57.23%. All below 60%. Evaluated on NotInject (339 samples). Section 3.3 and Figure 5.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "The MOF strategy effectively mitigates over-defense without relying on over-defense-specific datasets.",
    379       "evidence": "Table 2 ablation shows MOF with scratch retrain improves over-defense from 64.31% to 87.32% when combined with data-centric augmentation, using only automatically generated debiasing data. Section 4.2.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "PIGuard achieves performance comparable to GPT-4o while being a lightweight DeBERTa-based model.",
    384       "evidence": "Table 1: PIGuard 83.48% average vs. GPT-4o 85.53% average, with PIGuard inference time of 15.34ms vs. GPT-4o's 7907.18ms. PIGuard is 503x more efficient on the accuracy/inference-time efficiency metric (about 515x faster in raw inference time). Section 5.2.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Data-centric augmentation alone worsens the over-defense problem.",
    389       "evidence": "Table 2: Over-defense accuracy drops from 75.22% (basic dataset) to 64.31% after adding data-centric augmentation. Section 5.3.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "MOF outperforms traditional shortcut mitigation methods, which degrade malicious detection performance.",
    394       "evidence": "Table 3: Standard shortcut mitigation achieves 86.73% over-defense but only 65.53% malicious (10.42% drop). MOF achieves 87.32% over-defense AND 77.39% malicious. Section 5.4.",
    395       "supported": "moderate"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No variance or uncertainty reporting",
    401       "detail": "All results across Tables 1, 2, 3, 7, 9, and 10 report single point estimates with no error bars, confidence intervals, standard deviations, or multiple-run results. It is impossible to assess result stability."
    402     },
    403     {
    404       "flag": "No statistical significance tests",
    405       "detail": "All comparative claims ('surpasses', 'outperforms') are based on raw number comparisons without any statistical significance testing. Given the lack of variance reporting, it is unclear whether the differences are meaningful."
    406     },
    407     {
    408       "flag": "Ambiguous improvement percentage",
    409       "detail": "The abstract and Section 5.2 claim PIGuard 'surpasses' ProtectAIv2 'by 30.4%'. This is a relative improvement, but 'by X%' is conventionally read as percentage points in ML. The absolute improvement is 19.47pp. Table 1 caption reports 30.8%, inconsistent with the 30.4% figure."
    410     },
    411     {
    412       "flag": "Potential train/test data overlap",
    413       "detail": "Training data includes datasets (BIPIA_train, TaskTracker, jailbreak-classification) from the same sources as test benchmarks. While train/test splits appear to be honored, independence is not explicitly verified or discussed."
    414     },
    415     {
    416       "flag": "LLM-dependent data construction and evaluation",
    417       "detail": "GPT-4o-mini is used for both NotInject corpus generation AND safety verification (refinement), creating a potential circularity. If GPT-4o-mini has systematic blind spots, both construction and verification would share them."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Ignore Previous Prompt: Attack Techniques For Language Models",
    423       "authors": ["Fábio Perez", "Ian Ribeiro"],
    424       "year": 2022,
    425       "arxiv_id": "2211.09527",
    426       "relevance": "Foundational work identifying the concept of prompt injection attacks, including goal hijacking and prompt leakage."
    427     },
    428     {
    429       "title": "Not what you've signed up for: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    430       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    431       "year": 2023,
    432       "arxiv_id": "2302.12173",
    433       "relevance": "Key work on indirect prompt injection attacks against real-world LLM-integrated applications."
    434     },
    435     {
    436       "title": "Automatic and universal prompt injection attacks against large language models",
    437       "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"],
    438       "year": 2024,
    439       "arxiv_id": "2403.04957",
    440       "relevance": "Automatic attack generation methods for prompt injection, providing context for defense evaluation."
    441     },
    442     {
    443       "title": "Prompt Injection Attacks and Defenses in LLM-Integrated Applications",
    444       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    445       "year": 2023,
    446       "arxiv_id": "2310.12815",
    447       "relevance": "Formal framework and systematic benchmark of prompt injection attacks and defenses for LLM-integrated applications."
    448     },
    449     {
    450       "title": "Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models",
    451       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Keegan Hines", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"],
    452       "year": 2023,
    453       "arxiv_id": "2312.14197",
    454       "relevance": "BIPIA benchmark for indirect prompt injection, used as a key evaluation dataset in this paper."
    455     },
    456     {
    457       "title": "Are you still on track!? Catching LLM task drift with activations",
    458       "authors": ["Sahar Abdelnabi", "Aideen Fay", "Giovanni Cherubin", "Ahmed Salem", "Mario Fritz", "Andrew Paverd"],
    459       "year": 2024,
    460       "arxiv_id": "2406.00799",
    461       "relevance": "TaskTracker dataset for detecting LLM task drift from prompt injection, used in training data."
    462     },
    463     {
    464       "title": "AgentDojo: A Dynamic Environment to Evaluate Attacks and Defenses for LLM Agents",
    465       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunović", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    466       "year": 2024,
    467       "arxiv_id": "2406.13352",
    468       "relevance": "Dynamic evaluation environment for LLM agent attacks and defenses."
    469     },
    470     {
    471       "title": "Llama Guard: LLM-based input-output safeguard for human-AI conversations",
    472       "authors": ["Hakan Inan", "Kartikeya Upasani", "Jianfeng Chi"],
    473       "year": 2023,
    474       "arxiv_id": "2312.06674",
    475       "relevance": "LLM guardrail approach for detecting unsafe content, represents alternative defense paradigm to prompt guard models."
    476     },
    477     {
    478       "title": "StruQ: Defending Against Prompt Injection with Structured Queries",
    479       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David A. Wagner"],
    480       "year": 2024,
    481       "relevance": "Structured query defense approach against prompt injection, an alternative defense strategy."
    482     },
    483     {
    484       "title": "Neural Exec: Learning (and Learning from) Execution Triggers for Prompt Injection Attacks",
    485       "authors": ["Dario Pasquini", "Martin Strohmeier", "Carmela Troncoso"],
    486       "year": 2024,
    487       "arxiv_id": "2403.03792",
    488       "relevance": "Novel attack method using learned execution triggers for prompt injection."
    489     },
    490     {
    491       "title": "Tensor Trust: Interpretable Prompt Injection Attacks from an Online Game",
    492       "authors": ["Sam Toyer", "Olivia Watkins", "Ethan Adrian Mendes"],
    493       "year": 2023,
    494       "arxiv_id": "2311.01011",
    495       "relevance": "Gamified prompt injection attack dataset providing real-world attack examples."
    496     },
    497     {
    498       "title": "WildGuard: Open One-Stop Moderation Tools for Safety Risks, Jailbreaks, and Refusals of LLMs",
    499       "authors": ["Seungju Han", "Kavel Rao", "Allyson Ettinger"],
    500       "year": 2024,
    501       "arxiv_id": "2406.18495",
    502       "relevance": "Open-source moderation tool and benchmark used for benign accuracy evaluation in this paper."
    503     },
    504     {
    505       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents",
    506       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    507       "year": 2024,
    508       "arxiv_id": "2403.02691",
    509       "relevance": "Benchmark for indirect prompt injection in tool-integrated LLM agents, used as training data source."
    510     }
    511   ],
    512   "engagement_factors": {
    513     "practical_relevance": {
    514       "score": 3,
    515       "justification": "PIGuard is a fully open-source, lightweight prompt guard model with released code, data, and model weights that a practitioner could deploy immediately for prompt injection detection."
    516     },
    517     "surprise_contrarian": {
    518       "score": 1,
    519       "justification": "The finding that guard models have over-defense issues is somewhat expected given known shortcut learning problems in NLP classifiers; the MOF solution is novel but not contrarian."
    520     },
    521     "fear_safety": {
    522       "score": 2,
    523       "justification": "Demonstrates that existing prompt injection defenses (including Meta's PromptGuard) are unreliable, with accuracy near random guessing on trigger-word inputs, raising concerns about deployed defense systems."
    524     },
    525     "drama_conflict": {
    526       "score": 1,
    527       "justification": "Names specific commercial and open-source products (Meta PromptGuard, ProtectAI, Lakera) as deficient but presents findings in standard academic framing without controversy."
    528     },
    529     "demo_ability": {
    530       "score": 3,
    531       "justification": "Code, model, and datasets are released on GitHub (https://github.com/leolee99/PIGuard), making it immediately testable."
    532     },
    533     "brand_recognition": {
    534       "score": 1,
    535       "justification": "From university researchers (WashU, UW-Madison) rather than a major AI lab; published at ACL, which is prestigious but not a household name."
    536     }
    537   }
    538 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs