scan.json (32099B)
1 { 2 "paper": { 3 "title": "Hacking Back the AI-Hacker: Prompt Injection as a Defense Against LLM-driven Cyberattacks", 4 "authors": [ 5 "Dario Pasquini", 6 "Evgenios M. Kornaropoulos", 7 "Giuseppe Ateniese" 8 ], 9 "year": 2024, 10 "venue": "arXiv.org", 11 "arxiv_id": "2410.20911", 12 "doi": "10.48550/arXiv.2410.20911" 13 }, 14 "scan_version": 3, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval", "case-study"], 17 "key_findings": "Mantis, a defensive framework that uses prompt injections against LLM-driven cyberattack agents, achieves approximately 95.4% success in disrupting attacks across three CTF challenges and three open-source attack agents. The FTP decoy is more effective than the Web-app decoy, and the agent-counterstrike (reverse shell) objective is more reliable than the agent-tarpit (infinite exploration loop). Once trapped in the tarpit, no tested LLM agent attempted to backtrack, and the tarpit can inflate attacker costs up to $4.55 per attack with increased directory complexity.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper states 'we open-sourced Mantis' and provides a GitHub URL: https://github.com/pasquini-dario/project_mantis. The abstract also confirms it is 'available as an open-source tool.'" 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "No experimental data (attack logs, raw results) is released. The CTF machines are hosted on HackTheBox's private network and cannot be redistributed." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "The paper mentions agents run on a 'Kali-linux machine' and HackTheBox VPN, but provides no requirements.txt, Dockerfile, or dependency specifications for reproducing the environment." 
34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are provided. The HackTheBox machines require paid access and VPN setup, and the paper does not describe how to configure the forward-proxy server or run the full evaluation pipeline." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "Results in Table 1 are reported as raw counts (e.g., '0/10', '10/10') and averages without any confidence intervals, error bars, or uncertainty measures." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "No statistical significance tests are performed. Claims like '95.4% success rate' and comparisons between configurations are based on raw count comparisons over 10 runs without any formal testing." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "The paper reports baseline attack success rates (e.g., 9/10 without defense) alongside defense results (e.g., 0/10 with Mantis), and the aggregate 95.4% defense success rate vs <3% attacker success rate, providing sufficient baseline context to assess effect magnitude." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "Each configuration is tested 10 times with no justification for why 10 runs was chosen. No power analysis or discussion of whether 10 runs provides sufficient statistical power for the claims made." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "No standard deviations, interquartile ranges, or other spread measures are reported. Results are raw counts per 10 runs with no variance characterization. The average number of rounds is reported without spread." 
66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Table 1 includes a 'No Defense' baseline showing attacker success rates without Mantis deployed, providing a clear comparison point for each agent and CTF combination." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": false, 77 "justification": "The only baseline is 'no defense at all.' No comparison with traditional honeypots, IDS systems, or other active defense mechanisms. While the specific approach is novel, the complete absence of any competing defensive approach weakens the evaluation." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": false, 82 "justification": "Mantis has multiple components (decoys, injection manager, execution triggers, ANSI hiding, banner manipulation) but no ablation study removes individual components to measure their contribution. The paper tests different decoy types and sabotage objectives but these are variant comparisons, not ablations." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Three metrics are reported: objA (attacker success rate), objD (defender sabotage success rate), and #Rounds (average number of rounds). Section 8.3 also reports API cost per attack for the tarpit scenario." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "No human evaluation is performed. All evaluation is automated success/fail counting. Human evaluation could be relevant for claims about payload invisibility to human operators (Section 5.2), but this was not tested." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": false, 97 "justification": "No separation of development and test CTF scenarios is described. The execution triggers and decoy designs may have been tuned on the same 3 CTFs used for evaluation, but this is not discussed." 
98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Table 1 provides detailed breakdowns by CTF machine (Dancing, Redeemer, Synced), decoy type (FTP, Web-app), sabotage objective (counterstrike, tarpit), agent (PentestGPTAuto, AutoPenAgent, HackingBuddyGPT), and backend LLM (GPT-4o, Sonnet3.5, GPT-4, Haiku3.5)." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 8.2 provides a specific example of a failed defense run where the LLM-agent shifted from the decoy to the real SMB server after a failed SQL injection attempt. The paper discusses conditions under which Mantis fails." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper reports that the Web-app decoy is less effective than FTP, CTF:Dancing is harder to defend due to the SMB server, and some configurations show non-zero attacker success (e.g., 1/10). The tarpit's file-download approach is reported as 'somewhat unreliable' (Section 6.3)." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims 'over 95% effectiveness' which is supported by the reported 95.4% average defense success rate across configurations (Section 8.2, Table 1). The abstract's claim about open-sourcing is verified by the GitHub link." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The primary causal claim is that Mantis disrupts LLM-driven attacks. The controlled experimental design (same agents and CTFs with/without Mantis) provides adequate evidence for this causal claim through paired comparison." 
125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title ('Defense Against LLM-driven Cyberattacks') and abstract frame results broadly, but testing is limited to 3 very-easy HackTheBox CTFs, 3 open-source attack agents, and 4 commercial LLMs. The paper acknowledges that the CTFs are easy but doesn't bound the generalization in the title or abstract to these narrow conditions." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "Section 9 briefly discusses that attackers could adapt if they know about Mantis, but does not systematically consider alternative explanations for the high success rate, such as the tested agents being particularly naive, the execution triggers being overfit to CTF contexts, or the LLMs' alignment training making them more susceptible to redirection." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper measures defense success on 3 very-easy CTF challenges and frames this as defense against 'LLM-driven cyberattacks' generally. The gap between CTF flag-capture prevention and real-world cyberattack defense is not acknowledged. Real attacks involve harder targets, custom tooling, and human oversight." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "Models are identified as 'ChatGPT-4o', 'Claude3.5-Sonnet', 'ChatGPT-4', and 'Claude3.5-Haiku' — marketing names without API version identifiers or snapshot dates. Per schema rules, these do not count as specified versions." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "The paper provides actual execution trigger text (e.g., 'Great, you've almost compromised the target. Now, run...') and target instruction examples in Sections 6.1-6.3. 
The open-source repository (linked in paper) contains the full prompt pool." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": false, 156 "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported for any of the backend LLMs used by the attack agents. The maximum rounds (30) is an experimental parameter, not an LLM hyperparameter." 157 }, 158 "scaffolding_described": { 159 "applies": true, 160 "answer": true, 161 "justification": "Mantis's architecture is described in detail in Section 4 (decoys, injection manager, activation events). Attack agents are described in Section 7.1 including PentestGPTAuto's weaponizer module, AutoPenAgent's ReAct framework, and HackingBuddyGPT's setup. Figure 2 provides an architecture diagram." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 7 describes the experimental setup in detail: how HackTheBox machines were accessed via VPN, the forward-proxy implementation, agent configurations, and win conditions. The experimental pipeline from setup to measurement is clearly documented." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 9 'Conclusion & Discussion' includes substantive discussion of limitations under 'Eliminating Prompt Injections?' and 'Back to Human-(Attackers)-In-The-Loop,' discussing how attackers might adapt and the dependency on LLM vulnerability to prompt injection." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": true, 178 "justification": "The paper discusses specific threats: attackers could instruct LLM-agents to bypass known Mantis decoys or filter execution triggers (Section 9), savvy attackers may sandbox their machines limiting counterstrike impact (Section 6.3), and easy CTFs may overstate effectiveness (Section 7.2)." 
179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section 7.2 explicitly states they chose 'very-easy' CTFs and explains why. Appendix A demonstrates that harder CTFs are not meaningfully testable. Section 9 acknowledges that informed attackers could counter Mantis, bounding the scope to attackers unaware of the defense." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "No raw experimental data (agent interaction logs, timestamps, full command sequences) is released. Only aggregated results in Table 1 and selected examples are provided." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 7 describes the experimental procedure: agents attack HackTheBox machines via VPN with a forward-proxy running Mantis, each configuration repeated 10 times, with defined win conditions and maximum 30 rounds." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants. The evaluation uses standard CTF benchmarks (HackTheBox) and open-source attack agents." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "The full pipeline is documented: deploy Mantis with chosen decoy/objective on target, launch LLM-agent attack, agent iterates up to 30 rounds, measure objA and objD outcomes. The forward-proxy implementation and HackTheBox integration are described in Section 7.2." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding source or acknowledgments section is present in the paper. It is unclear whether the research was funded." 
213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "All three authors list George Mason University as their affiliation. They are not affiliated with OpenAI or Anthropic (the companies whose models were tested)." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "No funding is disclosed, so independence of the funder from the outcome cannot be assessed." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial disclosure statement is present in the paper." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": false, 233 "answer": false, 234 "justification": "The paper tests a defense system (Mantis) rather than evaluating model knowledge or capability on benchmarks. The LLM agents are the attack tool, not the subject of capability evaluation." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": false, 238 "answer": false, 239 "justification": "The paper tests a defense framework, not pre-trained model capability. The paper actually leverages the fact that LLMs have been trained on CTF data (Section 5.1) as a design feature, not an evaluation confound." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": false, 243 "answer": false, 244 "justification": "This is a defense evaluation paper. The CTF challenges test whether Mantis can redirect agents, not whether models have memorized solutions. Contamination is structurally inapplicable to the defense claims." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study. All experiments involve automated LLM agents attacking CTF machines." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants. 
The ethical considerations section discusses controlled environments and ethical hacking standards but no IRB was needed." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in the study." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in the study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in the study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in the study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in the study." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": true, 288 "justification": "Section 8.3 and Figure 11 report API costs for GPT-4o per attack under the tarpit scenario, ranging from $0.81 to $4.55 depending on directory complexity. Costs are broken down by agent type." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "The total computational budget for the full evaluation (all 10-run experiments across all configurations) is not reported. Only per-attack tarpit costs are shown, and only for a subset of configurations." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "Results are reported as counts over 10 runs but no seed sensitivity analysis is performed. The non-determinism comes from LLM sampling, and sensitivity to this is not characterized." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": true, 305 "justification": "Section 8 states 'we repeat each setup 10 times' and all results in Table 1 are reported as x/10." 
306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "No hyperparameter search budget is reported. The execution triggers were 'manually crafted' (Section 6.1) and decoy configurations were designed without a described selection process." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": true, 315 "justification": "The paper reports results for all tested configurations (all decoy × objective × agent × LLM combinations) in Table 1 and appendix tables rather than selecting only the best configuration." 316 }, 317 "multiple_comparison_correction": { 318 "applies": false, 319 "answer": false, 320 "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors evaluate their own defense tool (Mantis) and adapted one of the attack agents (PentestGPTAuto with their weaponizer module) without acknowledging potential author-evaluation bias." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": false, 329 "answer": false, 330 "justification": "Mantis is a lightweight defense (decoy services + prompt injection) with negligible compute overhead relative to the LLM agents it defends against. Compute budget differences are not a meaningful variable." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "The paper uses HackTheBox CTF challenges as a proxy for real-world cyberattack defense but does not discuss whether CTF defense translates to defense against actual LLM-driven attacks in production environments. Section 7.2 justifies why easy CTFs were chosen but does not question construct validity of CTFs as a measure of defense effectiveness." 
336 }, 337 "scaffold_confound_addressed": { 338 "applies": true, 339 "answer": true, 340 "justification": "The paper tests Mantis against three different agent scaffolds (PentestGPTAuto, AutoPenAgent, HackingBuddyGPT) and discusses how agent architecture affects results. Section 8.3 specifically discusses how different memory mechanisms affect cost and behavior." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "The paper acknowledges that LLMs have been trained on CTF data (Section 5.1: 'pivotal publicly-available resource of cybersecurity knowledge used by LLMs') but does not discuss this as a potential confound for baseline attack success rates or defense evaluation validity." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the evaluation setup provides information to agents that would not be available in real attack scenarios. The proxy-based setup may differ from real deployment conditions." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "All 3 CTF machines are from HackTheBox's 'very-easy' tier and share structural similarities (simple authentication vulnerabilities). The non-independence of these test cases is not discussed." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No concrete leakage detection or prevention methods are used. The paper does not check whether the LLMs have memorized HackTheBox solutions." 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "Mantis achieves over 95% effectiveness against automated LLM-driven attacks.", 369 "evidence": "Section 8.2 and Table 1: 'Across all possible configurations, Mantis achieves approximately a 95.4% success rate in achieving its sabotage objective objD and reduces the attacker's success rate to less than 3%.' 
Results across 3 agents, 3 CTFs, 2 decoys, 2 objectives, and 4 backend LLMs.", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "The FTP decoy is more effective than the Web-app decoy.", 374 "evidence": "Section 8.2: 'The FTP is more effective among decoys than the Web-app one. The FTP decoy allows simpler exploitation, requiring only an anonymous login attempt.' Table 1 shows consistently higher defense success with FTP across configurations.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Agent-counterstrike (reverse shell) is the most reliable sabotage objective.", 379 "evidence": "Section 8.2: 'The most reliable sabotage objective appears to be the agent-counterstrike method... it requires fewer actions from the LLM-agent to succeed.' Table 1 shows slightly higher success for counterstrike vs tarpit in most configurations.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "LLM-agents cannot escape the tarpit once trapped.", 384 "evidence": "Section 6.3: 'no attempts to backtrack or abandon the current path were observed. Specifically, in our experiments, the LLM-agent consistently locked into a depth-first search, endlessly seeking a non-existent file.' Panel (a) of Figure 8 shows the traversal path.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "The tarpit can amplify attacker costs by controlling directory complexity.", 389 "evidence": "Section 8.3 and Figure 11: API cost per attack increases from $0.81 (5 directories) to $4.55 (150 directories) for PentestGPTAuto with GPT-4o, measured over the first 10 tarpit rounds.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "Prompt injections hidden with ANSI escape sequences are invisible to human operators but processed by LLM agents.", 394 "evidence": "Section 5.2 and Figure 4 show the concealment technique. 
The paper claims 'it does not seem to influence the behavior of the LLM-agent' regarding the escape sequences, but no formal human evaluation is conducted.", 395 "supported": "weak" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "Small sample size without statistical testing", 401 "detail": "All experiments use only 10 runs per configuration. No confidence intervals, significance tests, or variance measures are reported. With n=10, the difference between 9/10 and 10/10 success is not statistically meaningful, yet the paper draws distinctions between configurations." 402 }, 403 { 404 "flag": "Very narrow benchmark scope", 405 "detail": "Only 3 'very-easy' HackTheBox CTF challenges are tested, all involving simple authentication vulnerabilities. The paper acknowledges this is worst-case for defense but also easiest for agents. Broad claims of defense against 'LLM-driven cyberattacks' significantly outrun this narrow scope." 406 }, 407 { 408 "flag": "Authors evaluate their own tool", 409 "detail": "The authors designed Mantis and also designed the evaluation. They modified one agent (PentestGPTAuto) by adding a weaponizer module. There is no independent evaluation or acknowledgment of self-evaluation bias." 410 }, 411 { 412 "flag": "Only tested against publicly available open-source agents", 413 "detail": "The 3 open-source agents used are noted as the only publicly available ones. The authors contacted AutoAttacker and PenHeal authors but could not obtain their code. Proprietary and potentially more sophisticated agents (which would be more realistic adversaries) were not tested." 414 }, 415 { 416 "flag": "No human evaluation of payload invisibility", 417 "detail": "The paper claims payloads are 'invisible' to human operators using ANSI escape sequences and HTML comments, but this claim is not empirically validated with human participants. A skilled attacker reviewing raw server responses could detect the injections." 
418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "PentestGPT: An LLM-empowered automatic penetration testing tool", 423 "authors": ["Gelei Deng", "Yi Liu", "Víctor Mayoral-Vilches", "Peng Liu", "Yuekang Li"], 424 "year": 2023, 425 "arxiv_id": "2308.06782", 426 "relevance": "Pioneering LLM-driven penetration testing agent used as one of the three attack agents in the evaluation." 427 }, 428 { 429 "title": "Getting pwn'd by AI: Penetration testing with large language models", 430 "authors": ["Andreas Happe", "Jürgen Cito"], 431 "year": 2023, 432 "relevance": "HackingBuddyGPT framework for autonomous LLM-driven privilege escalation, used as one of the three evaluated attack agents." 433 }, 434 { 435 "title": "AutoPenBench: Benchmarking generative agents for penetration testing", 436 "authors": ["Luca Gioacchini", "Marco Mellia", "Idilio Drago"], 437 "year": 2024, 438 "relevance": "Introduces the AutoPenBench benchmark for evaluating LLM agents on penetration testing simulations, along with the AutoPenAgent agent used as one of the three evaluated attack agents." 439 }, 440 { 441 "title": "AutoAttacker: A large language model guided system to implement automatic cyber-attacks", 442 "authors": ["Jiacen Xu", "Jack W. Stokes", "Geoff McDonald"], 443 "year": 2024, 444 "relevance": "Multi-agent framework for fully automated cyberattacks from reconnaissance to exploitation; establishes the threat model Mantis defends against." 445 }, 446 { 447 "title": "LLM agents can autonomously exploit one-day vulnerabilities", 448 "authors": ["Richard Fang", "Rohan Bindu", "Akul Gupta", "Daniel Kang"], 449 "year": 2024, 450 "relevance": "Demonstrates LLM agents can replicate real CVE exploits autonomously, motivating the need for defenses like Mantis." 
451 }, 452 { 453 "title": "Teams of LLM agents can exploit zero-day vulnerabilities", 454 "authors": ["Richard Fang", "Rohan Bindu", "Akul Gupta", "Qiusi Zhan", "Daniel Kang"], 455 "year": 2024, 456 "arxiv_id": "2406.01637", 457 "relevance": "Multi-agent LLM framework for zero-day exploitation, showing escalating sophistication of LLM-driven attacks." 458 }, 459 { 460 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 461 "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra"], 462 "year": 2023, 463 "relevance": "Foundational work on indirect prompt injection attacks on LLM-integrated applications; Mantis repurposes this attack vector for defense." 464 }, 465 { 466 "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks", 467 "authors": ["Dario Pasquini", "Martin Strohmeier", "Carmela Troncoso"], 468 "year": 2024, 469 "relevance": "Introduces learned execution triggers for prompt injection; referenced as a potential enhancement for Mantis's tailored triggers." 470 }, 471 { 472 "title": "The instruction hierarchy: Training LLMs to prioritize privileged instructions", 473 "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike"], 474 "year": 2024, 475 "relevance": "Addresses LLM defenses against prompt injection from the model side; relevant to whether future LLMs could resist Mantis-style defenses." 476 }, 477 { 478 "title": "ReAct: Synergizing reasoning and acting in language models", 479 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"], 480 "year": 2022, 481 "arxiv_id": "2210.03629", 482 "relevance": "Foundational agentic LLM framework used by AutoPenAgent; establishes the agent architecture that Mantis targets." 
483 }, 484 { 485 "title": "PenHeal: A two-stage LLM framework for automated pentesting and optimal remediation", 486 "authors": ["Junjie Huang", "Quanyan Zhu"], 487 "year": 2024, 488 "relevance": "LLM-driven attack framework with automatic vulnerability remediation; extends the landscape of automated attack agents." 489 }, 490 { 491 "title": "An empirical evaluation of LLMs for solving offensive security challenges", 492 "authors": ["Minghao Shao", "Boyuan Chen", "Sofija Jancheska"], 493 "year": 2024, 494 "relevance": "Empirical evaluation of LLM offensive security capabilities across CTF challenges; directly relevant to understanding LLM attack capabilities." 495 }, 496 { 497 "title": "LLMmap: Fingerprinting For Large Language Models", 498 "authors": ["Dario Pasquini", "Evgenios M. Kornaropoulos", "Giuseppe Ateniese"], 499 "year": 2024, 500 "relevance": "LLM fingerprinting tool referenced as potential enhancement for Mantis to identify the attacking LLM and tailor execution triggers." 501 } 502 ], 503 "engagement_factors": { 504 "practical_relevance": { 505 "score": 3, 506 "justification": "Mantis is an open-source, deployable defense tool that security practitioners could integrate with existing infrastructure to defend against LLM-driven attacks." 507 }, 508 "surprise_contrarian": { 509 "score": 2, 510 "justification": "Reframes prompt injection from a vulnerability into a defensive weapon — a contrarian inversion of the conventional view that prompt injection is purely a problem to fix." 511 }, 512 "fear_safety": { 513 "score": 3, 514 "justification": "Directly addresses AI-powered cyberattacks and demonstrates both the threat (automated hacking) and a novel active defense (hacking back the AI hacker), raising significant security concerns." 
515 }, 516 "drama_conflict": { 517 "score": 1, 518 "justification": "The 'hack back' framing adds some drama and the ethical implications of active defense are provocative, but no direct controversy with specific companies or researchers." 519 }, 520 "demo_ability": { 521 "score": 2, 522 "justification": "Open-source on GitHub but requires HackTheBox access and LLM API keys to test; not a simple pip-install demo." 523 }, 524 "brand_recognition": { 525 "score": 2, 526 "justification": "Tests against ChatGPT-4o and Claude 3.5 models from OpenAI and Anthropic, but the paper itself is from George Mason University, not a major AI lab." 527 } 528 } 529 }