ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (23392B)


      1 {
      2   "paper": {
      3     "title": "A Multi-Agent LLM Defense Pipeline Against Prompt Injection Attacks",
      4     "authors": [
      5       "S M Asif Hossain",
      6       "Ruksat Khan Shayoni",
      7       "Mohd Ruhul Ameen",
      8       "Akif Islam",
      9       "M. F. Mridha",
     10       "Jungpil Shin"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv",
     14     "arxiv_id": "2509.14285"
     15   },
     16   "scan_version": 2,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No repository URL, code archive, or link to implementation is provided anywhere in the paper."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The HPI_ATTACK_DATASET is described but no download link or repository is provided for the 55 attack prompts."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions ChatGLM-6B and Llama2-13B but provides no environment specifications, dependency lists, or setup details."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No reproduction instructions, scripts, or step-by-step guide provided."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Only point estimates (ASR percentages) are reported. No confidence intervals or error bars on any results."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims defense reduces ASR from 20-30% to 0% but no statistical significance tests are performed."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "ASR reductions are reported with baseline context: e.g., 'ASR reached 30% for ChatGLM' reduced to '0% across all tested scenarios' (Section V, Table IV). Percentage improvements with baselines provided."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No justification for why 55 attacks or 400 total instances are sufficient. No power analysis or discussion of sample adequacy."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or spread measures reported. All results are single-run point estimates."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Undefended systems serve as baselines (Section IV-B), and a taxonomy-based filter serves as a baseline defense (Section IV-C)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No comparison against existing prompt injection defense methods from the literature (e.g., SmoothLLM, Self-Guard, SelfDefend). The only baselines are undefended and a simple rule-based filter."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "The paper tests three architectures but does not ablate components within them (e.g., removing the Guard agent, removing the Coordinator). No controlled single-variable manipulation."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Only Attack Success Rate (ASR) is reported. No metrics for false positive rate, latency overhead, legitimate query handling quality, or user experience."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section III-A states 'Two authors independently reviewed all outputs, achieving over 95% agreement, with disagreements resolved through discussion.' This is human evaluation of system outputs."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No separation between development and test sets. The same 55 attacks appear to be used for both designing the defense and evaluating it."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table V provides per-category ASR breakdown across 10 attack categories with baseline and protected rates."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "With 100% mitigation claimed, no failure cases exist to discuss. The paper does not analyze near-misses or edge cases."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "Every experiment shows perfect defense (0% ASR). No configurations that failed or approaches that were tried and abandoned are reported."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims 100% mitigation across 400 evaluations and 0% ASR, which matches Table IV results. The claims are internally consistent."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper claims the multi-agent pipeline 'achieved 100% mitigation' (causal) but the evaluation does not control for confounds — the defense was designed with knowledge of the attack dataset, creating a circular evaluation."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title and abstract frame results broadly ('Prompt Injection Attacks') but testing was limited to 55 hand-crafted attacks on two older models (ChatGLM-6B, Llama2-13B). No discussion of whether results generalize to other models or novel attack types."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No discussion of alternative explanations. The 100% success could be due to the attacks being too simple, the defense being tuned to the specific dataset, or the models being old/small. None of these alternatives are considered."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "ASR measures whether an attack's intended failure mode appears in the output, but the paper frames this as comprehensive 'security' without discussing what ASR does and does not capture (e.g., subtle information leakage, partial compliance, semantic attacks)."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "ChatGLM-6B (2022) and Llama2-13B (2023) are specified with model sizes in Section IV-A."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "System prompts for the Coordinator and Guard agents are described functionally (Table III) but the actual prompt text used is not provided."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "No temperature, top-p, max tokens, or other inference parameters are reported for either model."
    157       },
    158       "scaffolding_described": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The multi-agent architecture is described in detail with workflow diagrams (Figs 1-3), agent roles (Table III), and the pipeline stages including API Gateway, Event Orchestrator, Coordinator, Guard, and Buffer stages."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "The paper states attacks were 'manually validated and labeled' but does not describe how the 55 attacks were selected, what criteria were used, or how the 400 total instances were generated from the 55 unique attacks."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No dedicated limitations or threats-to-validity section. The conclusion mentions 'open challenges remain' in one sentence but this is not a substantive limitations discussion."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No specific threats to validity discussed. The conclusion vaguely mentions 'adaptive adversarial strategies, indirect and multi-turn attacks' but these are future work, not threats to current validity."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No explicit statement of what results do NOT show. The paper does not bound its claims to the specific models or attack types tested."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The 55 attack prompts and model outputs are not released. Only aggregated ASR statistics are provided."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section III-A describes the HPI_ATTACK_DATASET construction with 8 categories, representative examples (Table I), and suite composition (Table II)."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants were recruited — the study evaluates LLM defenses against a curated attack dataset, and the only human involvement is the authors' own review of model outputs."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "It is unclear how 55 unique attacks become 400 evaluation instances. Table II shows 100+50+50=200 for the three suites, but the paper claims 400 total. The multiplication/duplication process is not explained."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding or acknowledgments section present in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "All author affiliations are listed: Wichita State University, Marshall University, University of Rajshahi, AIUB, University of Aizu."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding information disclosed, so independence cannot be assessed."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "This paper tests defense mechanisms against prompt injection, not model knowledge on benchmarks. The attacks are novel hand-crafted prompts, not knowledge benchmarks."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Not evaluating pre-trained model capability on a knowledge benchmark — testing defense pipeline effectiveness against injection attacks."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Not a benchmark-knowledge evaluation. The concern here is defense effectiveness, not whether models memorized answers."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. LLM pipelines are evaluated against a curated attack dataset, with outputs reviewed by the authors themselves."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No latency, token cost, or overhead measurements reported despite the multi-agent pipeline requiring multiple LLM calls per query."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No hardware, GPU hours, or computational resources mentioned."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of random seeds or sensitivity across runs. Results appear to be single-run."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No explicit statement of how many runs produced the results."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No hyperparameter search described for the defense pipeline or LLM inference settings."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No description of how the defense configurations were tuned or selected."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "Authors designed both the attacks and defenses and evaluate their own system without acknowledging this bias."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The multi-agent pipeline requires multiple LLM calls per query vs. single-call baseline, but no compute/latency comparison is provided."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "No discussion of whether the 55 hand-crafted attacks are representative of real-world prompt injection threats. The benchmark's construct validity is unexamined."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The defense pipeline itself is scaffolding around the LLMs, but no analysis separates the contribution of the scaffolding from model capabilities. The same scaffold-model bundle is always evaluated together."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the attack patterns in the dataset overlap with patterns seen during model training."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "The defense was designed with knowledge of the attack categories it would be tested against — this potential feature leakage (defense tuned to test set) is not discussed."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "The 400 instances are derived from 55 unique attacks, likely with repetitions. Non-independence between instances within attack categories is not discussed."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention method applied."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "Multi-agent defense pipeline achieves 100% mitigation, reducing ASR to 0% across all 400 evaluations on 55 unique attack types",
    369       "evidence": "Table IV shows 0% ASR for all three defended configurations (v1 Taxonomy ON, Phase2 Coordinator ON, Phase2 Chain ON) across 200 defended evaluations. Section V-A.",
    370       "supported": "weak"
    371     },
    372     {
    373       "claim": "Baseline (undefended) systems show 20-30% ASR",
    374       "evidence": "Table IV: v1 Taxonomy OFF = 30% ASR (30/100), Phase2 Coordinator OFF = 20% (10/50), Phase2 Chain OFF = 30% (15/50). Section V-A.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Delegate attacks have 100% baseline ASR, making them the most dangerous category",
    379       "evidence": "Table V shows Delegate category: 10 cases, 100% baseline ASR. Section V-B, Fig 6.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Defense success is driven more by comprehensive detection than architectural sophistication",
    384       "evidence": "Section VI states all three architectures achieved identical 0% ASR despite different designs (Table VI). However, this could also indicate the attacks are too simple.",
    385       "supported": "weak"
    386     }
    387   ],
    388   "methodology_tags": ["benchmark-eval"],
    389   "key_findings": "A multi-agent LLM defense pipeline using Coordinator and Guard agents achieves 0% Attack Success Rate across 400 evaluations of 55 prompt injection attacks on ChatGLM-6B and Llama2-13B. Baseline undefended systems showed 20-30% vulnerability, with delegate and role-play attacks being most dangerous. All three defense architectures (rule-based filter, coordinator pipeline, chain-of-agents) achieved identical perfect mitigation, suggesting attack simplicity rather than architectural sophistication drove results.",
    390   "red_flags": [
    391     {
    392       "flag": "Perfect results (100% mitigation)",
    393       "detail": "Achieving exactly 0% ASR across all 400 evaluations with no failures or near-misses is suspicious. Real-world security evaluations almost never achieve perfect scores. This suggests the attack dataset may be too simple or the defense was overfitted to the test set."
    394     },
    395     {
    396       "flag": "Circular evaluation design",
    397       "detail": "The authors designed both the attack dataset and the defense pipeline. The defense was likely tuned with knowledge of the attacks it would face, creating a non-independent evaluation. No held-out attacks were used."
    398     },
    399     {
    400       "flag": "Outdated and small models",
    401       "detail": "ChatGLM-6B (2022) and Llama2-13B (2023) are older, smaller models. Results may not generalize to larger, more capable models like GPT-4 or Claude that are more commonly deployed."
    402     },
    403     {
    404       "flag": "Tiny attack dataset",
    405       "detail": "Only 55 unique attacks across 8 categories is very small for a security evaluation claiming comprehensive coverage. Real prompt injection attacks are diverse and rapidly evolving."
    406     },
    407     {
    408       "flag": "No false positive analysis",
    409       "detail": "The paper reports no metrics on whether the defense incorrectly blocks legitimate queries. A defense that blocks everything would also achieve 0% ASR."
    410     },
    411     {
    412       "flag": "Inconsistent sample counts",
    413       "detail": "Table II shows 25+15+15=55 unique attacks and 100+50+50=200 suite instances, but the paper claims 400 evaluations. The doubling is never explained in the text; the Table IV counts (200 runs with defenses OFF plus 200 with defenses ON) suggest each suite was run once undefended and once defended, but this is never explicitly stated."
    414     },
    415     {
    416       "flag": "No comparison with existing defenses",
    417       "detail": "Multiple published prompt injection defenses exist (SmoothLLM, Self-Guard, SelfDefend, PPA) but none are compared against. The only baselines are undefended and a trivial rule-based filter."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    423       "authors": ["F. Liu"],
    424       "year": 2023,
    425       "arxiv_id": "2310.12815",
    426       "relevance": "Systematic taxonomy of prompt injection attacks and defense benchmarking — directly relevant to evaluating LLM security."
    427     },
    428     {
    429       "title": "GenTel-Shield: A model-agnostic prompt injection detector",
    430       "authors": ["S. Li"],
    431       "year": 2024,
    432       "arxiv_id": "2409.00594",
    433       "relevance": "Prompt injection detection tool relevant to LLM security evaluation methodology."
    434     },
    435     {
    436       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    437       "authors": ["K. Greshake"],
    438       "year": 2023,
    439       "relevance": "Seminal work on indirect prompt injection in real LLM applications, key to AI safety research."
    440     },
    441     {
    442       "title": "SmoothLLM: Defending large language models against jailbreaking attacks",
    443       "authors": ["A. Robey"],
    444       "year": 2023,
    445       "arxiv_id": "2310.03684",
    446       "relevance": "Defense mechanism against LLM jailbreaking — directly comparable approach to the surveyed paper."
    447     },
    448     {
    449       "title": "Are aligned neural networks adversarially aligned?",
    450       "authors": ["N. Carlini"],
    451       "year": 2023,
    452       "relevance": "Evaluates robustness of safety-aligned LLMs to adversarial attacks — core AI safety evaluation work."
    453     },
    454     {
    455       "title": "Jailbroken: How does LLM safety training fail?",
    456       "authors": ["A. Wei"],
    457       "year": 2023,
    458       "relevance": "Analysis of failure modes in LLM safety training — foundational for understanding LLM security vulnerabilities."
    459     },
    460     {
    461       "title": "Self-guard: Empower the LLM to safeguard itself",
    462       "authors": ["Y. Wang"],
    463       "year": 2023,
    464       "arxiv_id": "2310.15851",
    465       "relevance": "Self-defensive LLM architecture relevant to multi-agent safety approaches."
    466     },
    467     {
    468       "title": "SelfDefend: LLMs can defend themselves against jailbreaking in a practical manner",
    469       "authors": ["B. Jiang"],
    470       "year": 2023,
    471       "arxiv_id": "2312.00038",
    472       "relevance": "Practical LLM self-defense mechanism against jailbreaking, comparable approach."
    473     },
    474     {
    475       "title": "To protect the LLM agent against prompt injection with polymorphic prompt",
    476       "authors": ["Y. Wang"],
    477       "year": 2025,
    478       "arxiv_id": "2506.05739",
    479       "relevance": "Polymorphic prompt assembly defense against prompt injection — directly relevant defense technique."
    480     },
    481     {
    482       "title": "Universal and transferable adversarial attacks on aligned language models",
    483       "authors": ["A. Zou"],
    484       "year": 2023,
    485       "arxiv_id": "2307.15043",
    486       "relevance": "GCG attack method demonstrating transferable adversarial prompts across LLMs — key attack methodology paper."
    487     }
    488   ]
    489 }

Impressum · Datenschutz