scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24679B)
      1 {
      2   "paper": {
      3     "title": "Prompt Injection Attacks in Large Language Models via a Comprehensive Analysis of Attack Vectors, Defense Mechanisms, and Future Directions",
      4     "authors": ["Ilkin Javadov"],
      5     "year": 2025,
      6     "venue": "Proceedings of IAM, V.14, N.2, 2025",
      7     "doi": "10.30546/2225-0530.14.2.2025.2013"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Empirical evaluation on GPT-4, Claude 3 Opus, and Gemini 1.5 Pro using 1,000 attack scenarios shows baseline vendor safeguards provide ~58% protection (42% ASR). A proposed six-layer defense-in-depth framework reduces ASR to 3.2% ± 1.1%. Ablation identifies architectural separation and multi-model verification as the key contributors. Analysis of five real-world incidents (2023–2025) documents financial losses up to $2.3M and detection delays up to 45 days.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No code repository, GitHub link, or archive is provided anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The curated dataset of 1,000 attack scenarios is not released. No download link or data availability statement is provided."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, dependency lists, or setup instructions are provided."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No reproduction instructions are provided. A researcher could not replicate the experiments from the information given."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The paper reports '58% ± 3.2%' for baseline protection and '3.2% ± 1.1%' for the proposed framework, described as '95% CI' in the methods section."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper reports 'p < 0.001 vs. baseline' and states 'paired t-tests' were used in the statistical methods."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper reports '92.4% relative reduction' in ASR, with baseline context (from 42% ASR to 3.2% ASR), providing magnitude in context."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper uses 1,000 attack scenarios but provides no justification for this number. No power analysis or rationale for why 1,000 is sufficient."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The paper states '3 runs/scenario for variance' and reports ± values (e.g., '3.2% ± 1.1%') across runs."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper compares against baseline vendor safeguards (~58% protection) and state-of-the-art single-layer defenses (~82%), establishing comparison points for the proposed framework."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Baselines include current vendor safeguards and reference recent techniques: spotlighting [15], multi-model verification [16], Azure AI Content Safety prompt shields [18], and Anthropic's alignment methods [17]."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "An ablation is claimed ('Ablation confirms architectural separation and multi-model verification as key contributors') but no ablation data, tables, or per-layer contribution figures are shown anywhere in the paper."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper reports attack success rate (ASR), false-positive rate (~5.3%), and latency overhead (+18%), providing multiple evaluation dimensions."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Attack success was determined via 'regex + manual review, Kappa=0.92' — human reviewers assessed whether model outputs achieved adversarial goals."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No train/test split or held-out evaluation is described. All 1,000 scenarios appear to be used for evaluation without separation from any tuning of defense parameters."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "The dataset composition is described (40% direct, 30% indirect, etc.) but results are only reported as aggregates. No breakdown of ASR by attack category, model, or defense layer."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "The residual 3.2% failure rate is acknowledged but no specific failure cases are analyzed. No examples of attacks that bypassed the defense framework are shown."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "No negative results are reported. No discussion of approaches that were tried and failed or configurations that didn't work."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract's headline figures (58% baseline protection, ~82% single-layer detection, 3.2% framework ASR) are restated in the results and conclusion sections with matching numbers."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The causal claim that the defense framework 'reduces' ASR is supported by controlled comparison: same 1,000 scenarios tested with and without defense layers. The ablation design (removing components) is an adequate causal identification strategy, though the ablation data itself is not shown."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper tests only GPT-4 (0613), Claude 3 Opus, and Gemini 1.5 Pro, but the title and conclusion make broad claims about 'Large Language Models' and 'modern AI systems' without bounding to the three tested models."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "No alternative explanations are discussed. The paper does not consider confounds such as whether the defense framework's effectiveness is specific to the attack dataset used, or whether the 1,000 scenarios are representative of real-world attacks."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper measures ASR (attack success rate) and frames results in terms of ASR and protection rate. The measured proxies match the claims — no inflated framing beyond what was measured."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Specific model versions are provided: 'GPT-4 (0613, OpenAI API)', 'Claude 3 Opus (v1.2)', 'Gemini 1.5 Pro (2025-11)'."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "No actual attack prompts or system prompts are provided. The attack scenarios are described categorically (direct, indirect, etc.) but no example prompts are shown."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Hyperparameters are stated: 'Temp=0.7, max_tokens=512, safety on.'"
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The six-layer defense architecture is a filter pipeline, not an agentic system."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "Dataset construction is described only as '1,000 scenarios manually crafted + validated ... Sources: augmented from GLUE/Alpaca + taxonomy-based attacks.' How GLUE/Alpaca (NLP benchmarks) were converted to attack scenarios is unexplained. Filtering and augmentation steps are not documented."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No dedicated limitations section exists. The conclusion mentions 'residual 3.2% vulnerability highlights fundamental limitations in current LLM designs' but this discusses general LLM limitations, not methodological limitations of the study itself."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No specific threats to validity are discussed. No mention of dataset representativeness, potential biases in manual annotation, or external validity of results from three models."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show or what settings/models/attack types are excluded from the conclusions."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The 1,000 attack scenarios and raw model outputs are not available for independent verification."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "Data collection is described only superficially: 'manually crafted + validated' and 'augmented from GLUE/Alpaca + taxonomy-based attacks.' The actual crafting methodology, augmentation process, and validation criteria are not explained."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "Three annotators validated the dataset (Kappa=0.92) but their qualifications, expertise, and selection are not described. The provenance and representativeness of the 1,000 scenarios are undocumented."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "No data pipeline is documented. The paper does not explain how raw GLUE/Alpaca data was transformed into attack scenarios, what filtering was applied, or how the 40%/30% category distribution was determined."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding source or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliation is listed: Azerbaijan Technical University, Baku, Azerbaijan. The author is not affiliated with any of the evaluated model providers."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence of funding from outcomes cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial disclosure statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This paper tests defense mechanisms against prompt injection attacks rather than evaluating model knowledge on a benchmark. Contamination is structurally inapplicable."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "The paper tests defense mechanisms, not model capability on knowledge-based benchmarks. Train/test overlap is not a relevant concern."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "The paper evaluates attack success rates against defenses, not model knowledge retrieval. Benchmark contamination is structurally inapplicable."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study. The 3 annotators validated the dataset but were not study subjects."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Latency overhead is reported: 'Latency +18% avg.' for the defense framework. However, no monetary cost or token consumption figures are given."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total computational budget (GPU hours, API spend, hardware) is stated for the experiments."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "The paper runs '3 runs/scenario for variance' and reports results with ± values, capturing run-to-run variation from temperature sampling."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Explicitly stated: '3 runs/scenario for variance.'"
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search is described for the defense framework components. Thresholds and parameters for the 6 defense layers appear untuned or tuned without documentation."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The paper presents one configuration of the 6-layer framework without explaining how parameters were selected or whether alternatives were tried."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "Only one formal statistical comparison is reported (framework vs. baseline, p<0.001). Multiple comparison correction is not needed."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The paper's sole author both designed and evaluated the defense framework, without acknowledging author-evaluation bias or arranging an independent evaluation."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "Latency overhead (+18%) is mentioned but performance is not shown as a function of compute budget. No analysis of how each layer's cost contributes to the overall protection gain."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No discussion of whether the 1,000 curated attack scenarios are representative of real-world prompt injection attacks or have construct validity for measuring defense effectiveness."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "Three models are tested but it is unclear whether the defense framework was applied identically across all three. No explicit discussion of whether model-specific factors confound the aggregate results."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of whether the models' safety training already included defenses against these specific attack patterns, which would be a form of temporal leakage for defense evaluation."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup provides information not available in real deployment scenarios."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of independence among the 1,000 attack scenarios. Scenarios 'augmented from GLUE/Alpaca' may share structural patterns that inflate or deflate success rates."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No concrete leakage detection or prevention method is used or described."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Baseline vendor safeguards provide approximately 58% ± 3.2% protection against prompt injection (42% ASR)",
    364       "evidence": "Stated in abstract and conclusion. Evaluation on GPT-4 (0613), Claude 3 Opus, Gemini 1.5 Pro with 1,000 attack scenarios. No detailed results table is shown.",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "State-of-the-art single-layer defenses reach at most ~82% detection but with elevated false positive rates",
    369       "evidence": "Stated in abstract and conclusion, referencing reported detection rates of '67-82%' from [9,14]. No detailed per-defense results shown.",
    370       "supported": "weak"
    371     },
    372     {
    373       "claim": "The proposed six-layer defense-in-depth framework reduces ASR to 3.2% ± 1.1% (92.4% relative reduction over baseline, p<0.001)",
    374       "evidence": "Statistical significance reported (p<0.001, paired t-tests). 95% CIs provided. 3 runs per scenario. However, no detailed results table or per-layer breakdown is shown.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Ablation confirms architectural separation and multi-model verification as key contributors to defense effectiveness",
    379       "evidence": "One-sentence claim in abstract and conclusion. No ablation data, tables, or per-component contribution figures are presented anywhere in the paper.",
    380       "supported": "unsupported"
    381     },
    382     {
    383       "claim": "The defense framework maintains acceptable false-positive rates (~5.3%) and moderate latency (+18% average)",
    384       "evidence": "Stated in conclusion. No breakdown of FP rates by layer or attack type. No latency measurements shown.",
    385       "supported": "weak"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "Skeletal results section with no data",
    391       "detail": "The Results section (Section 3) contains only the experimental setup (dataset, models, procedure, stats) but no actual results tables, figures, or detailed findings. All quantitative claims appear only in the abstract and conclusion with no supporting evidence shown in between."
    392     },
    393     {
    394       "flag": "Ablation claimed but not shown",
    395       "detail": "The paper repeatedly claims ablation studies were conducted and names key components, but presents zero ablation data. For a six-layer architecture, showing per-layer contributions is essential."
    396     },
    397     {
    398       "flag": "Scrambled reference list",
    399       "detail": "In-text citations do not match the reference list. For example, '[3] Greshake et al.' in the text refers to the indirect prompt injection paper, but reference [3] is Brown et al. 'Language Models are Few-Shot Learners.' Reference [8] is actually Greshake et al. This suggests careless reference management."
    400     },
    401     {
    402       "flag": "Unexplained dataset provenance",
    403       "detail": "The paper claims 1,000 attack scenarios were 'augmented from GLUE/Alpaca + taxonomy-based attacks.' GLUE and Alpaca are NLP benchmarks/instruction-following datasets, not attack datasets. How they were converted to prompt injection attacks is completely unexplained."
    404     },
    405     {
    406       "flag": "No per-model or per-category results",
    407       "detail": "Three models are tested and 11 attack vectors are taxonomized, but all results are reported as single aggregate numbers. There is no breakdown showing which models are more vulnerable or which attack types are more successful."
    408     },
    409     {
    410       "flag": "Claims significantly outrun evidence",
    411       "detail": "The conclusion makes sweeping claims about 'actionable insights, empirical benchmarks, and a deployable framework' but the paper provides no code, no data, no detailed results, and no deployment guidelines beyond high-level descriptions."
    412     },
    413     {
    414       "flag": "Taxonomy claims 11 vectors but shows 7",
    415       "detail": "The abstract mentions '7 attack vectors' but Section 2.4 claims '11 attack vectors (Table 1).' Table 1 shows only 7 rows. The discrepancy is not acknowledged or explained."
    416     }
    417   ],
    418   "cited_papers": [
    419     {
    420       "title": "More than you've asked for: A Comprehensive Analysis of Novel Prompt Injection Threats to Application-Integrated Large Language Models",
    421       "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"],
    422       "year": 2023,
    423       "arxiv_id": "2302.12173",
    424       "relevance": "Foundational work on indirect prompt injection via external content in LLM-integrated applications."
    425     },
    426     {
    427       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    428       "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"],
    429       "year": 2023,
    430       "relevance": "Demonstrates practical indirect prompt injection attacks against real-world LLM applications."
    431     },
    432     {
    433       "title": "Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study",
    434       "authors": ["Y. Liu", "G. Deng", "Z. Xu", "Y. Li", "Y. Zheng", "Y. Zhang", "L. Zhao", "T. Zhang", "Y. Liu"],
    435       "year": 2023,
    436       "arxiv_id": "2305.13860",
    437       "relevance": "Empirical study of jailbreaking techniques against ChatGPT through prompt engineering."
    438     },
    439     {
    440       "title": "A Comprehensive Framework for Understanding and Mitigating Prompt Injection Attacks",
    441       "authors": ["W. Liu", "Y. Wang", "X. Chen", "H. Li", "J. Zhang", "Y. Zhao"],
    442       "year": 2024,
    443       "relevance": "Provides a formal framework and mitigation strategies for prompt injection, published at USENIX Security."
    444     },
    445     {
    446       "title": "Adversarial Worms: Self-Replicating Prompt Injection in Multi-Agent LLM Systems",
    447       "authors": ["S. Kumar", "R. Patel", "L. Zhang", "M. Williams"],
    448       "year": 2025,
    449       "arxiv_id": "2501.00234",
    450       "relevance": "Explores emerging self-replicating prompt injection threats in multi-agent LLM systems."
    451     },
    452     {
    453       "title": "On the Opportunities and Risks of Foundation Models",
    454       "authors": ["R. Bommasani", "D.A. Hudson", "E. Adeli"],
    455       "year": 2021,
    456       "arxiv_id": "2108.07258",
    457       "relevance": "Comprehensive analysis of foundation model risks including security vulnerabilities."
    458     },
    459     {
    460       "title": "Language Models are Few-Shot Learners",
    461       "authors": ["T. Brown", "B. Mann", "N. Ryder"],
    462       "year": 2020,
    463       "relevance": "Foundational GPT-3 paper demonstrating LLM capabilities that enabled the applications now targeted by prompt injection."
    464     }
    465   ]
    466 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs