scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30306B)
      1 {
      2   "paper": {
      3     "title": "Detection Method for Prompt Injection by Integrating Pre-trained Model and Heuristic Feature Engineering",
      4     "authors": [
      5       "Yi Ji",
      6       "Runzhi Li",
      7       "Baolei Mao"
      8     ],
      9     "year": 2025,
     10     "venue": "Knowledge Science, Engineering and Management",
     11     "arxiv_id": "2506.06384",
     12     "doi": "10.48550/arXiv.2506.06384"
     13   },
     14   "scan_version": 3,
     15   "active_modules": [
     16     "experimental_rigor",
     17     "data_leakage"
     18   ],
     19   "methodology_tags": [
     20     "benchmark-eval"
     21   ],
     22   "key_findings": "DMPI-PMHFE, a dual-channel feature fusion framework combining DeBERTa-v3-base semantic extraction with heuristic rule-based feature engineering, outperforms four existing detection baselines (Fmops, ProtectAI, SafeGuard, InjecGuard) on accuracy, recall, and F1-score across three datasets. Ablation experiments confirm each module contributes positively. When deployed as an active defense, it reduces attack success rates to 10-14% across five LLMs (GLM-4, LLaMA 3 variants, Qwen 2.5, GPT-4o), outperforming Self-Reminder and Self-Defense baselines. However, no statistical tests, error bars, or multi-run results are reported.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper creates safeguard-v2 by augmenting a public HuggingFace dataset and constructing deepset-v2 and ivanleomk-v2, but does not release these custom datasets. The base HuggingFace datasets are public, but the augmented versions used for training and evaluation are not made available."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper specifies DeBERTa-v3-base and en_core_web_sm tokenizer, plus training hyperparameters (Section 4.2), but provides no requirements.txt, Python version, GPU specifications, or dependency versions needed to recreate the environment."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. A researcher would need to reconstruct the entire pipeline from the method description."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Tables 1, 2, and 3 report only point estimates for all metrics (accuracy, precision, recall, F1, ASR). No confidence intervals, error bars, or uncertainty measures are provided."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims DMPI-PMHFE 'outperforms' and is 'superior to' baselines based solely on comparing raw numbers. No statistical significance tests (p-values, t-tests, etc.) are used."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Results are reported with baseline context throughout. For example, Table 3 and Section 4.3 state 'reduces the ASR of glm-4-9b-chat from 71.71% to 14.34%' and the ablation shows recall improving 'from 93.27% to 98.59%' on safeguard-v2, providing clear before/after context for the magnitude of improvements."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification is given for the dataset sizes (10,400 training, 1,300 test, etc.) or the 251-sample defense benchmark. No power analysis is discussed."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No standard deviations, variance across runs, or any spread measures are reported. All results appear to be from single runs."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Four detection baselines (Fmops, ProtectAI, SafeGuard, InjecGuard) are compared in Table 1, and two defense baselines (Self-Reminder, Self-Defense) plus undefended base models in Table 3."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines include InjecGuard (2024), SafeGuard (2023), ProtectAI (2024), and Self-Reminder (2023). These are recent and described as 'currently widely applied on Hugging Face, enjoying high recognition and practical value.'"
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Table 2 presents ablation experiments progressively adding modules: M1 (DeBERTa only), M1+M2 (+ synonym matching), M1+M2+M3 (+ pattern matching), showing each module's contribution across all three datasets."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Four metrics (accuracy, precision, recall, F1-score) are used for detection evaluation (Table 1, 2) and attack success rate (ASR) for defense evaluation (Table 3)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No human evaluation of the system's detection outputs is performed. Manual verification was used only during dataset creation (quality assurance for safeguard-v2), not to evaluate the model's predictions."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 4.1 states safeguard-v2 is 'divided into training (10,400 samples, 80%), validation (1,300 samples, 10%), and test sets (1,300 samples, 10%).' Additionally, two external validation datasets (deepset-v2, ivanleomk-v2) are used."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results are broken down per dataset (safeguard-v2, ivanleomk-v2, deepset-v2) in Tables 1-2, and per LLM in Table 3. However, no per-attack-type breakdown is provided."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "No qualitative examples of failures or error analysis are shown. The paper mentions precision decreases when M3 is added and notes performance variation across datasets, but does not examine specific cases where the detector fails."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The paper reports that adding M3 causes precision to decrease (e.g., from 99.58% to 98.00% on safeguard-v2) and discusses the trade-off: 'M3 expands detection coverage to capture more attack variants, inevitably introducing some false positives.'"
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The abstract claims DMPI-PMHFE 'outperforms existing methods in terms of accuracy, recall, and F1-score' — Table 1 confirms the highest values in these metrics across all three datasets. The abstract claims it 'significantly reduces attack success rates across mainstream LLMs' — Table 3 confirms the lowest ASR across all five tested LLMs."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The main causal claims come from the ablation study (Table 2), which uses controlled single-variable manipulation: progressively adding M2 and M3 to M1. The claim that each module 'contributes positively' is supported by consistent improvements in accuracy, recall, and F1 across all three datasets, although precision decreases when M3 is added."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title claims a general 'Detection Method for Prompt Injection' but the paper only tests direct prompt injection (explicitly stated: 'We focus on detecting direct prompt injection'). The paper does not bound its title-level claims to this scope. Additionally, only English-language datasets are used without noting this limitation."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "No alternative explanations for the results are discussed. The paper does not consider confounds such as whether the heuristic rules simply overfit to the specific attack patterns in the test sets, or whether the improvement is driven by dataset overlap between safeguard-v2 training and test distributions."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures detection accuracy/F1 and claims detection effectiveness, and measures ASR and claims defense effectiveness. The measurements directly correspond to the claims — there is minimal proxy-outcome gap."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "DeBERTa-v3-base is specified for the detection model. For LLMs, 'glm-4-9b-chat, Llama-3-8B-Instruct, Llama-3.3-70B-Instruct, Qwen2.5-7B-Instruct' are adequately specified, but 'ChatGPT-4o' lacks a snapshot date or API version. GPT-4o is also used for data generation without version specification."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "GPT-4o was used to generate 3,000 training samples via 'prompt engineering' but the prompts are not provided. The defense evaluation uses a benchmark (ref [28]) but the system prompts or how attacks were presented to LLMs are not fully documented."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 4.2 reports: Adam optimizer, cross-entropy loss, learning rate 2e-5, batch size 16, weight decay 0.02, early stopping with patience 3. The many-shot threshold of 3 is also documented. However, LLM inference parameters (temperature, etc.) for defense evaluation are not stated."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. DMPI-PMHFE is a classifier that operates as a filter before LLM input."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 4.1 documents dataset construction: augmenting xTRam1/safeguard-prompt-injections with 15 attack patterns, generating 3,000 samples via GPT-4o, and a three-stage quality process (manual verification, deduplication, balanced sampling). Train/val/test split ratios (80/10/10) are specified."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "There is no dedicated limitations section. The Conclusion contains two sentences: 'Nevertheless, this study has certain limitations. The precision of DMPI-PMHFE requires further enhancement.' This is insufficient for a substantive discussion."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The only limitation mentioned is precision needing improvement. No specific threats to validity are discussed — no consideration of overfitting to known attack patterns, generalization to new attacks, dataset bias, or language limitations."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 1 explicitly states 'We focus on detecting direct prompt injection,' distinguishing from indirect prompt injection. This bounds the scope of the work to a specific attack category."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The safeguard-v2 dataset, deepset-v2, and ivanleomk-v2 (the augmented versions) are not released. Only the original base datasets are public on HuggingFace."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 4.1 describes data collection: base dataset from HuggingFace (7,000 benign + 3,000 malicious), augmented with 15 attack patterns via GPT-4o (3,000 samples), quality assured through manual verification, deduplication, and balanced sampling."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. Data sources are public HuggingFace datasets and GPT-4o generated samples."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The pipeline from base dataset (10,000) + GPT-4o generation (3,000) → safeguard-v2 (13,000) → 80/10/10 split (10,400/1,300/1,300) is documented with counts at each stage. External validation datasets (354 and 610 samples) are identified by source."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information or acknowledgments section is present in the paper."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "All three authors list Zhengzhou University affiliations with ORCID IDs and email addresses. They are not evaluating their own commercial product."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "No funding is disclosed, so funder independence cannot be assessed. The work may be unfunded university research, but the paper does not say."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial disclosure statement is present in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "The paper trains its own classifier (fine-tuned DeBERTa) and tests defenses against prompt injection attacks. It does not evaluate a pre-trained model's zero-shot capability on a benchmark. The LLMs are tested for vulnerability, not knowledge."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Same rationale: the paper tests defense effectiveness, not model knowledge capability on benchmarks."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "Same rationale: this is a defense evaluation study, not a model capability benchmark."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No inference latency or cost is reported for the detection model. For a system proposed as an active defense filter that processes every input before reaching the LLM, latency is a critical practical concern."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "No GPU hours, training time, hardware specifications, or total computational budget are stated anywhere in the paper."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No mention of random seeds or seed sensitivity analysis. Results appear to be from single runs."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The number of experimental runs is never stated. It is unclear whether results are from one run or averaged over multiple runs."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "A sensitivity analysis is described for the many-shot threshold (selected as 3), but no overall hyperparameter search budget is reported — number of configurations tried, search method, or compute spent on tuning are not documented."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The many-shot threshold selection is justified via sensitivity analysis, but the overall model configuration (learning rate, batch size, weight decay) appears to use standard defaults without justification for why these specific values were chosen."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "No statistical tests are performed at all, so correction for multiple comparisons is not applicable."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors evaluate their own system against baselines without acknowledging author-evaluation bias. While the baselines are published HuggingFace models (not re-implementations), the evaluation setup and datasets are controlled by the authors."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "No comparison of computational costs between DMPI-PMHFE and baselines. The dual-channel architecture may be more expensive than single-model baselines, but this is not discussed."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The paper does not discuss whether the benchmarks (safeguard-v2, deepset-v2, ivanleomk-v2) adequately represent real-world prompt injection attacks, or whether high detection scores on these datasets translate to effective real-world defense."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "No scaffolding is involved in the detection approach."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "DeBERTa-v3-base is pre-trained on general text. No discussion of whether any test data patterns appeared in DeBERTa's pre-training corpus or in GPT-4o's training data (used to generate samples)."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether the heuristic features (keyword lists, pattern rules) derived from training data leak information about the test distribution, or whether GPT-4o-generated training samples share distributional properties with the test set."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "Training and test data for safeguard-v2 come from the same augmented dataset via random splitting. No discussion of whether samples within the same attack category share structural similarities that could inflate test performance."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No leakage detection or prevention methods are used. No deduplication between train and test beyond the general deduplication step during dataset construction."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "DMPI-PMHFE outperforms existing detection methods (Fmops, ProtectAI, SafeGuard, InjecGuard) in accuracy, recall, and F1-score across three benchmark datasets.",
    374       "evidence": "Table 1 (Section 4.3): Highest accuracy (97.94%, 94.75%, 91.24%), recall (98.59%, 93.93%, 84.31%), and F1-score (98.29%, 96.03%, 90.21%) on safeguard-v2, ivanleomk-v2, and deepset-v2, respectively.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Each module (DeBERTa, synonym matching, pattern matching) contributes positively to detection performance.",
    379       "evidence": "Table 2 (Section 4.3): Progressive addition of M2 and M3 consistently improves accuracy, recall, and F1 across all three datasets. E.g., recall on safeguard-v2 goes from 93.27% (M1) to 95.64% (M1+M2) to 98.59% (M1+M2+M3).",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "DMPI-PMHFE significantly reduces attack success rates across mainstream LLMs compared to Self-Reminder and Self-Defense.",
    384       "evidence": "Table 3 (Section 4.3): ASR reduced to 10.35-14.34% across five LLMs. On glm-4-9b-chat, ASR drops from 71.71% (undefended) to 14.34% (DMPI-PMHFE) vs 35.45% (Self-Reminder) and 39.04% (Self-Defense).",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "DMPI-PMHFE offers consistent protection across diverse LLM architectures, unlike Self-Reminder and Self-Defense which vary significantly.",
    389       "evidence": "Table 3: DMPI-PMHFE ASR ranges 10.35-14.34% (4pp spread) across LLMs, while Self-Reminder ranges 19.52-39.84% (20pp spread) and Self-Defense 15.53-41.03% (25pp spread).",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "No error bars or uncertainty quantification",
    396       "detail": "All results across Tables 1-3 are point estimates with no confidence intervals, standard deviations, or indication of how many runs were performed. It is impossible to know if the reported differences are meaningful or within noise."
    397     },
    398     {
    399       "flag": "Best performance on internal dataset",
    400       "detail": "DMPI-PMHFE performs best on safeguard-v2, which the authors created and whose distribution is 'closely aligned with the training data' (their own words). Performance degrades notably on external datasets (e.g., F1 from 98.29% to 90.21% on deepset-v2)."
    401     },
    402     {
    403       "flag": "GPT-4o used for both training data generation and defense evaluation",
    404       "detail": "3,000 training samples were generated by GPT-4o, and GPT-4o is also one of the five LLMs evaluated for defense effectiveness. The paper does not discuss whether GPT-4o-generated attack patterns in training data bias the model toward defending GPT-4o specifically."
    405     },
    406     {
    407       "flag": "No code or custom dataset release",
    408       "detail": "Neither the code nor the safeguard-v2 dataset is released, making independent verification or replication impossible."
    409     },
    410     {
    411       "flag": "No latency analysis for proposed active defense",
    412       "detail": "DMPI-PMHFE is proposed as a real-time pre-processing filter for every LLM input. No inference latency or throughput analysis is provided, which is critical for practical deployment."
    413     },
    414     {
    415       "flag": "Small defense evaluation benchmark",
    416       "detail": "Defense effectiveness is evaluated on only 251 attack samples from a single benchmark (CyberSecEval 2). The diversity and representativeness of these attacks relative to real-world threats is not discussed."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    422       "authors": [
    423         "Kai Greshake",
    424         "Sahar Abdelnabi",
    425         "Shailesh Mishra",
    426         "Christoph Endres",
    427         "Thorsten Holz",
    428         "Mario Fritz"
    429       ],
    430       "year": 2023,
    431       "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications, defines the attack taxonomy this paper builds on."
    432     },
    433     {
    434       "title": "Ignore previous prompt: Attack techniques for language models",
    435       "authors": [
    436         "Fábio Perez",
    437         "Ian Ribeiro"
    438       ],
    439       "year": 2022,
    440       "relevance": "Early systematic study of direct prompt injection attack techniques for language models."
    441     },
    442     {
    443       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    444       "authors": [
    445         "Yupei Liu",
    446         "Yuqi Jia",
    447         "Runpeng Geng",
    448         "Jinyuan Jia",
    449         "Neil Zhenqiang Gong"
    450       ],
    451       "year": 2024,
    452       "relevance": "Provides formal framework and benchmarks for evaluating prompt injection attacks and defenses."
    453     },
    454     {
    455       "title": "InjecGuard: Benchmarking and mitigating over-defense in prompt injection guardrail models",
    456       "authors": [
    457         "Hao Li",
    458         "Xiaogeng Liu"
    459       ],
    460       "year": 2024,
    461       "arxiv_id": "2410.22770",
    462       "relevance": "Addresses the over-defense problem in prompt injection detection models, serving as a baseline in this paper."
    463     },
    464     {
    465       "title": "StruQ: Defending against prompt injection with structured queries",
    466       "authors": [
    467         "Sizhe Chen",
    468         "Julien Piet",
    469         "Chawin Sitawarin",
    470         "David Wagner"
    471       ],
    472       "year": 2024,
    473       "arxiv_id": "2402.06363",
    474       "relevance": "Architecture-based defense that separates prompts and data into two channels to prevent injection."
    475     },
    476     {
    477       "title": "Jatmo: Prompt injection defense by task-specific finetuning",
    478       "authors": [
    479         "Julien Piet",
    480         "Maha Alrashed",
    481         "Chawin Sitawarin",
    482         "Sizhe Chen"
    483       ],
    484       "year": 2024,
    485       "relevance": "Defense method using non-instruction fine-tuning for specific tasks, representing architecture-based defenses."
    486     },
    487     {
    488       "title": "LLM Self Defense: By self examination, LLMs know they are being tricked",
    489       "authors": [
    490         "Mansi Phute",
    491         "Alec Helbling",
    492         "Matthew Daniel Hull",
    493         "ShengYun Peng"
    494       ],
    495       "year": 2024,
    496       "relevance": "Self-supervision defense baseline where LLMs evaluate their own outputs for harmful content."
    497     },
    498     {
    499       "title": "Defending ChatGPT against jailbreak attack via self-reminders",
    500       "authors": [
    501         "Yueqi Xie",
    502         "Jingwei Yi",
    503         "Jiawei Shao",
    504         "Justin Curl"
    505       ],
    506       "year": 2023,
    507       "relevance": "Self-reminder defense baseline that integrates system prompts into user queries to enhance LLM safety."
    508     },
    509     {
    510       "title": "Many-shot jailbreaking",
    511       "authors": [
    512         "Cem Anil",
    513         "Esin Durmus",
    514         "Nina Panickssery",
    515         "Mrinank Sharma"
    516       ],
    517       "year": 2025,
    518       "relevance": "Describes the many-shot jailbreaking attack pattern that this paper's pattern matching module specifically targets."
    519     },
    520     {
    521       "title": "Security and privacy challenges of large language models: A survey",
    522       "authors": [
    523         "Badhan Chandra Das",
    524         "M Hadi Amini",
    525         "Yanzhao Wu"
    526       ],
    527       "year": 2025,
    528       "relevance": "Comprehensive survey of LLM security and privacy challenges providing broader context for prompt injection defense work."
    529     },
    530     {
    531       "title": "CyberSecEval 2: A wide-ranging cybersecurity evaluation suite for large language models",
    532       "authors": [
    533         "Manish Bhatt",
    534         "Sahana Chennabasappa",
    535         "Yue Li",
    536         "Cyrus Nikolaidis"
    537       ],
    538       "year": 2024,
    539       "arxiv_id": "2404.13161",
    540       "relevance": "Provides the 251-sample prompt injection benchmark used for defense effectiveness evaluation in this paper."
    541     },
    542     {
    543       "title": "Soft begging: Modular and efficient shielding of LLMs against prompt injection and jailbreaking based on prompt tuning",
    544       "authors": [
    545         "Simon Ostermann",
    546         "Kevin Baum",
    547         "Christoph Endres"
    548       ],
    549       "year": 2024,
    550       "arxiv_id": "2407.03391",
    551       "relevance": "Modular defense approach against prompt injection using prompt tuning, addressing similar goals of protecting LLMs."
    552     }
    553   ],
    554   "engagement_factors": {
    555     "practical_relevance": {
    556       "score": 1,
    557       "justification": "Proposes a prompt injection detection framework but releases no code or dataset and reports no latency analysis, making it unusable without significant reimplementation."
    558     },
    559     "surprise_contrarian": {
    560       "score": 0,
    561       "justification": "Confirms the expected finding that combining semantic and heuristic features improves detection over either alone, with no counterintuitive results."
    562     },
    563     "fear_safety": {
    564       "score": 1,
    565       "justification": "Addresses prompt injection as a security threat but focuses on defense rather than demonstrating novel attacks or revealing new vulnerabilities."
    566     },
    567     "drama_conflict": {
    568       "score": 0,
    569       "justification": "No controversy, no challenge to specific companies or popular approaches — straightforward incremental improvement over existing baselines."
    570     },
    571     "demo_ability": {
    572       "score": 0,
    573       "justification": "No code, no dataset, no demo released; the custom safeguard-v2 dataset and model weights are unavailable."
    574     },
    575     "brand_recognition": {
    576       "score": 0,
    577       "justification": "From Zhengzhou University with no well-known authors; published at KSEM (Knowledge Science, Engineering and Management), a niche conference rather than a major venue."
    578     }
    579   }
    580 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs