ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (31613B)


      1 {
      2   "paper": {
      3     "title": "Prompt Infection: LLM-to-LLM Prompt Injection within Multi-Agent Systems",
      4     "authors": ["Donghyun Lee", "Mo Tiwari"],
      5     "year": 2024,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2410.07283",
      8     "doi": "10.48550/arXiv.2410.07283"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Prompt Infection demonstrates that self-replicating malicious prompts can propagate across agents in multi-agent systems, with self-replicating attacks outperforming non-replicating ones by 14-209% in global messaging. GPT-4o is more resistant to infection but more dangerous when compromised due to higher execution precision. Infection spreads in logistic growth patterns in social simulations and can exploit memory retrieval systems by manipulating importance scores. The proposed LLM Tagging defense, combined with Marking, reduces attack success to 0%, though neither is effective in isolation.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code repository, GitHub link, or archive is provided anywhere in the paper. The full attack prompt is shown in Appendix A, but no experimental code or simulation framework is released."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions creating 'a dataset of 120 user instructions across three tool types' paired with synthetic documents (Section 4.1), but this dataset is not released or made publicly available."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specification, dependency list, or system requirements are provided. The paper mentions using OpenAI's GPT models but provides no setup details."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions, scripts, or step-by-step guide are provided. The attack prompt is shown in Appendix A, but the full experimental setup (multi-agent simulation, evaluation pipeline) is not documented at a reproducible level."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates (e.g., '13.92% higher success rate', '209% more effective'). No confidence intervals, error bars, or uncertainty measures are provided in any figure or table."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper makes multiple comparative claims (GPT-4o vs GPT-3.5, self-replicating vs non-replicating, various defenses) based solely on comparing raw percentages without any statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports relative differences with baseline context: 'Self-Replicating infection achieves a 13.92% higher success rate' for GPT-4o, '209% more effective' for GPT-3.5 (Section 5.1). Defense results in Figure 7 show absolute success rates before and after applying LLM Tagging."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The dataset of 120 user instructions (360 pairs total) is stated without justification for why this number was chosen. No power analysis or discussion of statistical adequacy."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No standard deviations, variance, or spread measures are reported. The importance score experiment (Table 1) reports averages 'over 100 runs' but gives no variance. For all other results the number of runs is not stated, so it is unclear whether they are single runs or averages."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "A Non-Replicating Prompt Infection baseline is established (Section 4.1): the infection lacks self-replication, with the malicious prompt instructing the agent to 'say perform A.' This allows direct comparison of self-replication's impact."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The only baseline is the authors' own non-replicating variant. No comparison to other MAS attack methods from prior work (e.g., Zhang et al. 2024a, Gu et al. 2024, Cohen et al. 2024's AI worm), despite discussing them in related work."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple ablation-style comparisons: self-replicating vs non-replicating infection, global vs local messaging, with vs without importance score manipulation (Table 1, Figures 6a-b), and individual defense mechanisms with and without LLM Tagging (Figure 7)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper evaluates attack success rate across multiple threat types (data theft, scam, malware, content manipulation), analyzes failure categories (Attack Ignored, Mixed Action, Deformed Infection, No Action, Agent Error in Figure 5), and measures infection propagation dynamics (Figure 6)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluators are reported as assessing whether attacks or defenses succeeded. Success criteria are predefined (Section 4.1), but the paper does not state whether success was judged automatically or by manual inspection."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No discussion of separating development and test data. The attack prompts were presumably developed and tested on the same 360 pairs. No held-out evaluation set is mentioned."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by messaging type (global vs local, Figures 4a-b), model (GPT-4o vs GPT-3.5), threat type (data theft, scam, malware, content manipulation), defense strategy (Figure 7), failure category (Figure 5), and population size (Figure 6)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Figure 5 provides a detailed analysis of failure reasons across categories: Attack Ignored, Mixed Action, Deformed Infection, No Action, and Agent Error, with comparative analysis between GPT-4o and GPT-3.5 in both infection modes."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Several negative results are reported: self-replicating infection is outperformed by non-replicating for data theft with many agents (Section 5.1), no individual defense strategy is effective alone (Section 6), LLM Tagging alone reduces success rate by only 5% (Section 6)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims — MAS are 'highly susceptible', infection spreads 'even when agents do not publicly share all communications' (local messaging experiments), LLM Tagging 'when combined with existing safeguards, significantly mitigates infection spread' (Marking + Tagging achieves 0% in Figure 7) — are all supported by experimental results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims (e.g., 'self-replication enables scalable attacks') are supported by controlled experimental comparisons: same setup with and without self-replication, same models with and without importance score manipulation. The experimental design involves controlled single-variable manipulation."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper tests only GPT-4o and GPT-3.5 Turbo with a single MAS architecture, but the title and abstract claim broadly about 'multi-agent systems.' While Section 7 acknowledges 'Our experiments focused on the GPT family, leaving other LLMs like Claude, Llama, and Gemini underexplored,' the main claims are not qualified to the tested setting."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for the observed results. For example, why GPT-4o is more resistant could relate to RLHF differences, safety-training intensity, or instruction-following fidelity — none of these are explored. The Section 7 limitations discuss coverage gaps but not confounds."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures 'attack success rate' and defines it precisely for each threat type (Section 4.1): data theft requires three agents compromised with a POST request generated; scam/malware/manipulation requires final agent producing malicious output while concealing the prompt. Claims match the granularity of measurements."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper uses 'GPT-4o' and 'GPT-3.5 Turbo' (Section 4.1) without specifying version snapshots or API dates. Per schema, marketing names like 'GPT-4o' without a snapshot date do not count as specified versions."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The full functional self-replication infection prompt is provided in Appendix A (Figure 8), including the actual text used. The defense prompts (Marking, Sandwich, etc.) are described by name with references to prior work."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No temperature, top-p, max_tokens, or other API parameters are reported for any of the GPT model calls used in experiments."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The multi-agent application structure is described: first agent is tool-specific, followed by strategist, summarizer, editor, and writer (Section 4.1). Communication methods (global and local messaging) are explained. The society of agents setup describes memory retrieval with top-K=3 selection based on importance, recency, and relevance scores."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The dataset creation process is described at high level (120 user instructions, 3 tool types, synthetic documents), but the specific process for generating user instructions, the synthetic data generation pipeline, and the web document injection methodology are not documented in sufficient detail to reproduce."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 'Limitations and Future Work' provides a dedicated discussion of the study's limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 7 discusses specific threats: experiments limited to GPT family, only basic MAS architectures tested, handcrafted attacks only (algorithmically generated prompts could bypass defenses), Claude preliminary tests incomplete due to computational costs, attack prompts are exposed in MAS offering detection opportunities."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7 explicitly states: other LLMs are underexplored, only basic MAS architectures tested, handcrafted attacks used (not automated prompt generation), stealthier methods needed. These are specific boundaries of what was not tested."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data, experimental logs, or detailed results are made available. Only aggregate figures and tables are presented."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The data creation process is described briefly — '120 user instructions across three tool types, paired with synthetic PDFs and emails' and 'synthetic user data stored in a CSV file' (Section 4.1) — but the actual generation methodology, selection criteria, and representative examples (beyond the attack prompt) are not provided."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data is entirely synthetic."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The pipeline from synthetic data generation to final attack success metrics is not documented step by step. How user instructions were paired with attack prompts, how success was determined programmatically, and the intermediate processing steps are not described."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section is present in the paper. No grants, sponsors, or funding sources are mentioned."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: Donghyun Lee (University College London) and Mo Tiwari (Stanford University). These are academic institutions with no product conflict regarding the evaluated OpenAI models."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding source is disclosed, so independence cannot be assessed. The absence of a funding statement is noted."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial disclosure statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper tests a security attack (prompt injection propagation), not model knowledge on a benchmark. The evaluation measures attack success rates, not the models' pre-trained capabilities. Contamination is structurally inapplicable."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Same as above — the paper evaluates a novel attack/defense mechanism, not pre-trained model capability on benchmark tasks."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Same as above — the evaluation is about prompt injection propagation success, not model knowledge that could be contaminated by training data."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. All experiments use LLM agents and synthetic data."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The ethical statement discusses responsible disclosure but no IRB review was needed or conducted."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No API costs, token counts, or latency figures are reported despite extensive use of GPT-4o and GPT-3.5 Turbo across 360+ attack scenarios and multi-agent society simulations with up to 50 agents."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget, API spend, or hardware requirements are stated. Section 7 mentions 'computational costs' preventing full Claude experiments, but the actual budget for GPT experiments is not quantified."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of random seeds or sensitivity analysis. The society simulation involves random pairwise dialogues but results are not reported across multiple random seeds."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The importance score experiment states 'averaged over 100 runs' (Table 1), but for the main attack experiments (360 pairs) and society simulations, the number of runs is not stated. It is unclear whether results represent single runs or averages."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search is described. The attack prompts, dataset size, population sizes, and K=3 memory retrieval parameter appear to be set without documented search or justification."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The infection prompt shown in Appendix A appears to be a refined artifact, but the paper does not describe how it was developed or selected from alternatives. No selection methodology is discussed."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper makes many comparisons (2 models × 2 infection modes × 2 messaging modes × 4 threat types, plus 6 defense strategies) without any multiple comparison correction."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors evaluate their own attack method and defense mechanism. No acknowledgment of author-evaluation bias, no independent evaluation, and no mitigation strategies discussed."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Self-replicating infection requires each agent to replicate the full prompt, increasing token consumption vs non-replicating. This compute difference is not quantified or discussed as a confound."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper does not discuss whether their simulated multi-agent system reflects real-world MAS deployments. The simulated environment (tool-specific first agent → strategist → summarizer → editor → writer) may not represent actual production architectures, but this validity gap is not addressed."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "GPT-4o and GPT-3.5 Turbo are compared within the same multi-agent scaffold, isolating the model variable. Both global and local messaging modes are tested for each model. The scaffold is held constant across model comparisons."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether GPT models may have encountered similar prompt injection patterns or defense techniques in their training data, which could affect their resistance/susceptibility rates."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup provides signals not available in real-world scenarios (e.g., the simulated environment may be simpler than production systems, making attacks artificially easier or harder)."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The 120 user instructions and their pairing with 3 tool types may share structural patterns. No discussion of independence between test cases or potential correlation in the 360 pairs."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is used or discussed."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Self-replicating infection consistently outperforms non-replicating infection in most attack scenarios",
    365       "evidence": "Figure 4a shows 13.92% higher success rate for GPT-4o and 209% improvement for GPT-3.5 in global messaging for scam, malware, and content manipulation. Exception: data theft with many agents (Section 5.1).",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Self-replicating infection is the only scalable method for compromising more than two agents in local messaging",
    370       "evidence": "Figure 4b shows non-replicating infection 'struggles to compromise more than two agents' in local messaging, while self-replicating infection achieves success rates that are ~20% lower than in global messaging but still meaningful (Section 5.1).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Stronger models (GPT-4o) are not inherently safer — they are more resistant to infection but more dangerous when compromised",
    375       "evidence": "Figure 5 shows GPT-4o ignores 66% of self-replicating attacks vs 9% for GPT-3.5, but GPT-4o has fewer 'Deformed Infection' and 'Mixed Action' failures, executing malicious tasks more precisely when compromised (Section 5.1).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Infection spreads in a logistic growth pattern in social simulations, and larger populations experience more efficient per-agent infection rates",
    380       "evidence": "Figure 6a shows full infection achieved at ~47% of total turns for 10 agents, decreasing to ~21.4% for 50 agents (Section 5.2).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Manipulating the importance scoring system creates a feedback loop that amplifies infection persistence and spread",
    385       "evidence": "Table 1 shows importance scores jump from 1.94 to 10.00 (GPT-4o) and 1.00 to 9.84 (GPT-3.5) with manipulation. Figure 6b shows without manipulation, infection dies out after K=3 turns (Section 5.2).",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Marking + LLM Tagging prevents all attacks, while neither defense is effective alone",
    390       "evidence": "Figure 7 shows Marking + LLM Tagging achieves 0% attack success rate. LLM Tagging alone reduces success by only 5%. Marking alone still permits 76% of attacks after counterattack (Section 6).",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No uncertainty quantification",
    397       "detail": "All results are reported as point estimates without confidence intervals, error bars, or variance across runs. For the society simulation with stochastic pairings, this is a significant omission — results could vary substantially across random seeds."
    398     },
    399     {
    400       "flag": "Small synthetic evaluation dataset",
    401       "detail": "The evaluation uses only 120 user instructions (360 pairs), all synthetically generated. No analysis of whether these represent realistic multi-agent usage patterns or whether results would hold on organic interactions."
    402     },
    403     {
    404       "flag": "Defense evaluation uses only handcrafted attacks",
    405       "detail": "The paper acknowledges in Section 7 that 'algorithmically generated prompts can bypass such defenses,' yet the defense evaluation (Section 6) uses only the authors' handcrafted attack prompt. The 0% success rate for Marking + LLM Tagging may not generalize to adaptive adversaries."
    406     },
    407     {
    408       "flag": "Broad claims from narrow evaluation",
    409       "detail": "Only GPT-4o and GPT-3.5 Turbo are tested in one simulated MAS architecture, but claims extend to 'multi-agent systems' broadly. Preliminary Claude tests are mentioned but were not completed."
    410     },
    411     {
    412       "flag": "No artifacts released",
    413       "detail": "Despite the paper publishing the full functional attack prompt in Appendix A, no code, data, or evaluation framework is released. The experimental simulation is not reproducible from the paper alone."
    414     },
    415     {
    416       "flag": "Construct validity of simulated MAS",
    417       "detail": "The simulated multi-agent application (tool-reader → strategist → summarizer → editor → writer) may not reflect production MAS architectures. Real systems may have safety layers, monitoring, or different communication patterns not captured in the simulation."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Not what you've signed up for: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    423       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    424       "year": 2023,
    425       "arxiv_id": "2302.12173",
    426       "relevance": "Foundational work on indirect prompt injection in LLM applications, which this paper extends to multi-agent systems."
    427     },
    428     {
    429       "title": "Here Comes The AI Worm: Unleashing Zero-click Worms that Target GenAI-Powered Applications",
    430       "authors": ["Stav Cohen", "Ron Bitton", "Ben Nassi"],
    431       "year": 2024,
    432       "arxiv_id": "2403.02817",
    433       "relevance": "Introduced self-propagating AI worm concept for single-agent LLMs via email; closely related attack vector to Prompt Infection."
    434     },
    435     {
    436       "title": "Agent Smith: A Single Image Can Jailbreak One Million Multimodal LLM Agents Exponentially Fast",
    437       "authors": ["Xiangming Gu", "Xiaosen Zheng", "Tianyu Pang", "Chao Du", "Qian Liu", "Ye Wang", "Jing Jiang", "Min Lin"],
    438       "year": 2024,
    439       "arxiv_id": "2402.08567",
    440       "relevance": "Demonstrates exponential jailbreak propagation through multimodal MAS agents via adversarial images."
    441     },
    442     {
    443       "title": "Breaking Agents: Compromising Autonomous LLM Agents Through Malfunction Amplification",
    444       "authors": ["Boyang Zhang", "Yicong Tan", "Yun Shen", "Ahmed Salem", "Michael Backes", "Savvas Zannettou", "Yang Zhang"],
    445       "year": 2024,
    446       "arxiv_id": "2407.20859",
    447       "relevance": "Studies prompt injection attacks in MAS without self-replication; discussed in this paper's related work as a contrast to Prompt Infection, but not experimentally compared against it."
    448     },
    449     {
    450       "title": "Ignore Previous Prompt: Attack Techniques For Language Models",
    451       "authors": ["Fábio Perez", "Ian Ribeiro"],
    452       "year": 2022,
    453       "arxiv_id": "2211.09527",
    454       "relevance": "Foundational work demonstrating prompt injection vulnerabilities in GPT-3, establishing the attack class studied here."
    455     },
    456     {
    457       "title": "Defending Against Indirect Prompt Injection Attacks With Spotlighting",
    458       "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"],
    459       "year": 2024,
    460       "arxiv_id": "2403.14720",
    461       "relevance": "Proposes Marking and Delimiting Data defenses used as baselines in this paper's defense evaluation."
    462     },
    463     {
    464       "title": "StruQ: Defending Against Prompt Injection with Structured Queries",
    465       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    466       "year": 2024,
    467       "arxiv_id": "2402.06363",
    468       "relevance": "Proposes a finetuning-based defense against prompt injection, representing the class of model-level defenses not applicable to black-box models."
    469     },
    470     {
    471       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    472       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang", "Yiran Wu", "Beibin Li", "Erkang Zhu", "Li Jiang", "Xiaoyun Zhang", "Shaokun Zhang", "Jiale Liu", "Ahmed Hassan Awadallah", "Ryen W. White", "Doug Burger", "Chi Wang"],
    473       "year": 2023,
    474       "arxiv_id": "2308.08155",
    475       "relevance": "Major multi-agent framework whose adoption motivates the security research in this paper."
    476     },
    477     {
    478       "title": "Generative Agents: Interactive Simulacra of Human Behavior",
    479       "authors": ["Joon Sung Park", "Joseph C. O'Brien", "Carrie J. Cai", "Meredith Ringel Morris", "Percy Liang", "Michael S. Bernstein"],
    480       "year": 2023,
    481       "arxiv_id": "2304.03442",
    482       "relevance": "Introduces the memory retrieval system (importance + recency + relevance) exploited by Prompt Infection in the social simulation experiments."
    483     },
    484     {
    485       "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses",
    486       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    487       "year": 2024,
    488       "arxiv_id": "2310.12815",
    489       "relevance": "Provides formal framework and benchmark for prompt injection attacks and defenses in single-agent systems."
    490     },
    491     {
    492       "title": "Tree of Attacks: Jailbreaking Black-Box LLMs Automatically",
    493       "authors": ["Anay Mehrotra", "Manolis Zampetakis", "Paul Kassianik", "Blaine Nelson", "Hyrum Anderson", "Yaron Singer", "Amin Karbasi"],
    494       "year": 2024,
    495       "arxiv_id": "2312.02119",
    496       "relevance": "Demonstrates automated generation of adversarial prompts that could bypass the defenses proposed in this paper."
    497     },
    498     {
    499       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    500       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. Zico Kolter", "Matt Fredrikson"],
    501       "year": 2023,
    502       "relevance": "Demonstrates universal adversarial attacks on aligned LLMs; cited as evidence that findings may generalize across models."
    503     },
    504     {
    505       "title": "PsySafe: A Comprehensive Framework for Psychological-based Attack, Defense, and Evaluation of Multi-agent System Safety",
    506       "authors": ["Zaibin Zhang", "Yongting Zhang", "Lijun Li", "Hongzhi Gao", "Lijun Wang", "Huchuan Lu", "Feng Zhao", "Yu Qiao", "Jing Shao"],
    507       "year": 2024,
    508       "arxiv_id": "2401.11880",
    509       "relevance": "Examines psychological manipulation attacks in MAS, complementary security research on agent safety."
    510     }
    511   ]
    512 }

Impressum · Datenschutz