ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (24276B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "GUARDIAN: A Multi-Tiered Defense Architecture for Thwarting Prompt Injection Attacks on LLMs",
      6     "authors": [
      7       "Parijat Rai",
      8       "Saumil Sood",
      9       "V. Madisetti",
     10       "Arshdeep Bahga"
     11     ],
     12     "year": 2024,
     13     "venue": "Journal of Software Engineering and Applications",
     14     "arxiv_id": null,
     15     "doi": "10.4236/jsea.2024.171003"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The abstract claims 100% blocking of attack prompts without disclosing this was on 50 self-crafted prompts; the critical caveat that results may not generalize to other attacks appears only as a disclaimer buried in section 6.3, not in the abstract.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper claims each filter layer causally improves blocking rates, but the test prompts were self-crafted by the same authors to exploit known weaknesses, making the evidence for that improvement circular rather than a demonstration of generalizable defense effectiveness.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The title and conclusions speak broadly about 'thwarting prompt injection attacks on LLMs,' but evaluation is limited to 50 hand-crafted prompts against a single model variant (Llama-2-7b-chat); no explicit boundary restricts the headline claim to this narrow setting.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No discussion of whether 100% blocking reflects overfitting to self-crafted prompts, what false positive rate on benign queries looks like, or whether attackers aware of the architecture could trivially bypass it.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Blocking rate on 50 custom prompts is equated with 'defense effectiveness against prompt injection attacks' without acknowledging that this custom dataset is not representative of real-world attack distributions.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No dedicated limitations or threats-to-validity section exists; the only acknowledgment is a single-sentence disclaimer in section 6.3 and passing remarks in the Conclusions.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No specific threats are identified; the sole acknowledgment is a generic statement that 'more sophisticated adversarial prompts could still penetrate our defenses,' with no discussion of dataset size, single-model scope, or false positive risk.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not explicitly bound results to Llama-2-7b, to 50 self-crafted prompts, or to any particular attack distribution; the disclaimer in 6.3 is one sentence and not framed as a scope statement.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding source is mentioned anywhere in the paper.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations (Bennett University, Georgia Institute of Technology, Cloudemy Technology Labs) are clearly disclosed on the title page.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding is disclosed, so funder independence cannot be assessed.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The paper explicitly states 'The authors declare no conflicts of interest regarding the publication of this paper.'",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "'Prompt injection,' 'jailbreaking,' and 'smaller LLMs' are used throughout without formal definition; 'smaller LLMs' is a core framing term that is never precisely characterized with parameter counts or other criteria.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper clearly states its contribution: a three-tiered defense framework (system prompt filter, pre-processing filter, pre-display filter) with an ethical prompt auto-suggestion feature.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 3 reviews 12 relevant papers and explicitly identifies Helbling et al. as the direct baseline that the third filter extends, showing how GUARDIAN builds on prior work rather than merely listing references.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No source code is released or linked; the paper describes the architecture and training process in prose without providing any code repository.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "The custom 50-prompt adversarial test set and the generated ethical prompt dataset are not released; only partial illustrative examples appear in the appendix.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Libraries are mentioned by name (PyTorch, HuggingFace Transformers, bitsandbytes, etc.) and hardware (RTX A6000 GPU) is noted, but no requirements file, Dockerfile, or versioned dependency list is provided.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "The paper describes the pipeline in narrative form but without code, data releases, or step-by-step instructions sufficient to reproduce the experiments.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All results (40%, 60%, 100% blocking rates) are reported as single numbers with no confidence intervals, error bars, or repeated trials.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are applied to any comparative claims, including the Zephyr vs. FLAN-T5 comparison or the per-layer blocking rate differences.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Percentage improvements are reported at each filter stage (0%→40%→60%→100%) with the explicit no-defense baseline stated, providing context for effect magnitude.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The 50-prompt test dataset is not justified; no power analysis or argument for why 50 prompts is sufficient for evaluating defense robustness is provided.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No variance, standard deviation, or repeated run statistics are reported for any results.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "The undefended Llama-2-7b-chat (0% blocking rate) serves as the explicit baseline against which each defense layer is measured.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "The only baseline is the undefended model; despite reviewing SmoothLLM, RA-LLM, Self-Reminder, and others in related work, none are empirically compared against GUARDIAN.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Tables 1–3 present results for each layer individually and cumulatively, effectively ablating each filter's contribution to the overall 100% blocking rate.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Defense evaluation uses blocking rate; the generation model evaluation uses validation loss, ROUGE (for FLAN-T5), and perplexity, covering multiple performance aspects.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "Human evaluation is not included and not clearly relevant to evaluating an automated defense architecture's blocking behavior.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": false,
    211           "justification": "All 50 adversarial prompts were custom-crafted by the authors and used directly as the test set; there is no held-out set from a realistic or independently sourced attack distribution.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Tables 1–3 provide per-base-prompt (BP1–BP5) and per-sub-prompt (SP1–SP10) breakdowns of blocking results at each filter layer.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": false,
    223           "justification": "The paper does not analyze cases where the defense fails; the disclaimer in section 6.3 acknowledges failure is possible but provides no examples, analysis, or characterization of failure modes.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": false,
    229           "justification": "No negative results are reported; the paper presents only the success of each filter layer with no discussion of where the system falls short on any prompt type.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Specific model variants are named throughout: Llama-2-7b-chat, bert-base-uncased, Zephyr-7B-α, FLAN-T5-Large; version snapshot dates are absent but specific checkpoint names are given.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Example adversarial prompts and outputs are shown in Appendix A.1–A.5; the system prompt addition text ('Keep in mind the ethical boundaries') is explicitly stated; filter 3's LLM query format is described.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "Training parameters are mentioned (epochs, batch size, AdamW optimizer, LoRA alpha/r/dropout) but specific numeric values for key hyperparameters such as learning rate are not provided.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "The three-layer architecture is described in detail in section 5, including how each filter operates, what inputs it receives, and how decisions are passed between layers.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "The Jigsaw dataset was undersampled from ~230K to ~32K entries for class balance; the ethical prompt dataset was filtered by toxicity score (1–4); train/val/test splits (70%/15%/15%) are stated.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "The custom 50-prompt adversarial test set and generated ethical prompt dataset are not released; only a few illustrative examples appear in the appendix.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "The adversarial prompt creation process (trial and error combining role-emulation, manipulative assistance, and alternative reality strategies) and Jigsaw dataset sourcing from Kaggle are described.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants were recruited; the study uses model-generated and publicly available data.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "The pipeline is described in narrative form but without code; the exact steps from raw Jigsaw data through fine-tuning to adversarial testing are not sufficiently documented for independent reproduction.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "The paper evaluates defense effectiveness against custom adversarial prompts, not model knowledge on standard benchmarks, making training cutoff not applicable.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "Some base prompts are 'adaptations of jailbreak prompts available in the public domain' that may have appeared in Llama-2's training data; this potential overlap is never discussed.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "No standard benchmark is used; the evaluation dataset is entirely custom-crafted by the authors.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants; the ethics section discusses research conduct principles, not IRB oversight.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "Generation time is mentioned as a metric in section 4.4 but no actual latency or throughput figures for the defense filters are reported anywhere in the results.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Hardware (RTX A6000 GPU) is mentioned but total training time, GPU-hours, or compute cost is not stated.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "The GUARDIAN three-tiered defense achieves 100% blocking of adversarial prompts against Llama-2-7b.",
    374       "evidence": "Table 3 shows all 50 prompts (5 base × 10 sub-prompts) blocked after applying all three filters cumulatively.",
    375       "supported": "weak"
    376     },
    377     {
    378       "claim": "Filter 1 (system prompt addition) alone blocks 40% of adversarial prompts.",
    379       "evidence": "Table 1 shows BP1 and BP2 (20 of 50 prompts) blocked; text states 40% blocking accuracy after the first filter.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Filter 2 (BERT toxic classifier + ethical prompt generator) raises cumulative blocking to 60%.",
    384       "evidence": "Section 6.2 states 30/50 prompts blocked after filter 2; Table 2 shows additional prompts blocked beyond those caught by filter 1.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Zephyr-7B-α outperforms FLAN-T5-Large for ethical prompt generation.",
    389       "evidence": "Tables 4 and 5 show Zephyr perplexity of 2.98–3.18 versus FLAN-T5 perplexity of 5.66–7.31 across configurations.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "A multi-layered approach is superior to single-filter defense systems.",
    394       "evidence": "No empirical comparison with existing single-filter systems (SmoothLLM, Self-Reminder, RA-LLM) is provided; the claim rests only on no single layer achieving 100% on their own custom dataset.",
    395       "supported": "unsupported"
    396     }
    397   ],
    398   "methodology_tags": [
    399     "benchmark-eval",
    400     "case-study"
    401   ],
    402   "key_findings": "GUARDIAN achieves 100% blocking of 50 hand-crafted adversarial prompts against Llama-2-7b using a three-layer defense: system prompt modification (40% blocking), BERT-based toxicity classifier with ethical prompt generation (cumulative 60%), and LLM self-evaluation of outputs (100% total). The Zephyr-7B-α model outperforms FLAN-T5-Large for ethical prompt generation by perplexity (2.98–3.18 vs. 5.66–7.31). All evaluation is conducted on a self-crafted dataset with no comparison against existing defense systems and no false positive analysis, severely limiting the validity of the 100% success headline.",
    403   "red_flags": [
    404     {
    405       "flag": "Circular evaluation",
    406       "detail": "Authors crafted both the 50-prompt attack dataset and the defense, then evaluated the defense on their own attacks; this biases the evaluation toward high blocking rates and provides no evidence of real-world generalizability."
    407     },
    408     {
    409       "flag": "Trivially small test set",
    410       "detail": "Only 50 adversarial prompts (5 base × 10 sub-prompts) are used for evaluation; this is insufficient to draw conclusions about defense robustness against a realistic attack distribution."
    411     },
    412     {
    413       "flag": "No false positive analysis",
    414       "detail": "The paper never reports how often the defense incorrectly blocks legitimate benign queries, making the 100% blocking rate uninterpretable as a practical security metric."
    415     },
    416     {
    417       "flag": "No comparison with existing defenses",
    418       "detail": "Despite reviewing SmoothLLM, RA-LLM, Self-Reminder, and MASTERKEY in related work, none are empirically compared against GUARDIAN, so the claimed superiority of multi-layer defense is asserted, not demonstrated."
    419     },
    420     {
    421       "flag": "Headline overstated in abstract",
    422       "detail": "The abstract claims 100% blocking without any qualifier; the critical caveat ('may not hold for all jailbreak prompts') appears only in a disclaimer in section 6.3."
    423     },
    424     {
    425       "flag": "Predatory venue",
    426       "detail": "Published in the Journal of Software Engineering and Applications from SCIRP, a publisher associated with predatory publishing practices, which suggests limited peer review rigor."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "LLM Self Defense: By Self Examination, LLMs Know They Are Being Tricked",
    432       "relevance": "Direct baseline for GUARDIAN's third filter; proposes LLM self-evaluation as a harm detection mechanism that this paper extends."
    433     },
    434     {
    435       "title": "Prompt Injection Attacks and Defenses in LLM-Integrated Applications",
    436       "relevance": "Formalizes prompt injection attack taxonomy and systematic defenses; closely related to GUARDIAN's problem framing."
    437     },
    438     {
    439       "title": "SmoothLLM: Defending Large Language Models against Jailbreaking Attacks",
    440       "relevance": "Competing defense approach using input perturbation and prediction aggregation; reviewed but not empirically compared."
    441     },
    442     {
    443       "title": "Defending against Alignment-Breaking Attacks via Robustly Aligned LLM",
    444       "relevance": "Alternative defense using robust alignment checking; related approach to GUARDIAN's multi-filter strategy."
    445     },
    446     {
    447       "title": "Jailbroken: How Does LLM Safety Training Fail?",
    448       "relevance": "Analyzes fundamental vulnerabilities in LLM safety training that motivate the attack surface GUARDIAN addresses."
    449     },
    450     {
    451       "title": "Do Anything Now: Characterizing and Evaluating In-the-Wild Jailbreak Prompts on Large Language Models",
    452       "relevance": "Comprehensive characterization of real-world jailbreak prompts relevant to the attack strategies used in GUARDIAN's dataset."
    453     },
    454     {
    455       "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models",
    456       "relevance": "The target model being defended; describes the safety mechanisms GUARDIAN supplements."
    457     }
    458   ],
    459   "engagement_factors": {
    460     "practical_relevance": {
    461       "score": 2,
    462       "justification": "The three-layer defense pattern is directly applicable to LLM deployments, though lack of released code reduces immediate adoptability."
    463     },
    464     "surprise_contrarian": {
    465       "score": 0,
    466       "justification": "The finding that multi-layer defenses outperform single-layer defenses is expected and not contrarian."
    467     },
    468     "fear_safety": {
    469       "score": 2,
    470       "justification": "Prompt injection and jailbreaking are genuine AI security concerns receiving practitioner attention, though no novel threats are raised."
    471     },
    472     "drama_conflict": {
    473       "score": 1,
    474       "justification": "The attacker-defender arms race framing provides mild drama but no controversy or surprising conflict."
    475     },
    476     "demo_ability": {
    477       "score": 1,
    478       "justification": "The architecture uses publicly available models but no code or demo is provided, requiring significant reimplementation effort."
    479     },
    480     "brand_recognition": {
    481       "score": 1,
    482       "justification": "Georgia Tech co-authorship provides some credibility but the work is not from a major AI safety or industry lab."
    483     }
    484   },
    485   "hn_data": {
    486     "threads": [],
    487     "top_points": 0,
    488     "total_points": 0,
    489     "total_comments": 0
    490   }
    491 }

Impressum · Datenschutz