ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27987B)


      1 {
      2   "paper": {
      3     "title": "AEGIS: Automated Co-Evolutionary Framework for Guarding Prompt Injection",
      4     "authors": [
      5       "Ting-Chun Liu",
      6       "Ching-Yu Hsu",
      7       "Kuan-Yi Lee",
      8       "Chi-An Fu",
      9       "Hung-yi Lee"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv preprint",
     13     "arxiv_id": "2509.00088"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No GitHub link, Zenodo archive, or other repository URL is provided in the paper. No mention of code release in abstract, footnotes, or anywhere in the text."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The dataset consists of student-submitted articles from a National Taiwan University course. The paper states they were 'manually modified to anonymize personal data and for copyright purposes' but no download link or public release of the data is provided."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper lists the LLM used (gpt-4.1-mini) in the hyperparameters table (Appendix A.3), but does not provide library versions, a requirements.txt, Dockerfile, or other environment specification sufficient to reproduce the environment."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No README, step-by-step commands, or reproduction scripts are referenced. The algorithms (Algorithms 1 and 2) describe the procedure at a high level, but are not runnable reproduction instructions."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Section 4.2 states experiments were conducted three times and standard deviation was calculated. Figures 4 and 5 show shaded regions representing standard deviation across runs."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No statistical significance tests (t-tests, p-values, bootstrap, etc.) are applied when comparing AEGIS to baselines. Claims of outperformance are based solely on comparing point estimates (e.g., TPR 0.84 vs. 0.64) without any significance testing."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The abstract explicitly states the defense improves TPR by 0.20 compared to the previous state of the art (from 0.64 to 0.84), with only a slight decrease in TNR of 0.02. Table 1 provides the baseline context needed to assess effect magnitude."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The training set uses 50 GPT-generated benign articles (40 train, 10 validation), and evaluation uses 143 malicious + 100 benign student articles. No power analysis or justification for these sample sizes is provided."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Section 4.2 states: 'all experiments were conducted three times. The results presented in this paper are the average values from these three runs. We also calculated the standard deviation.' Figures 4 and 5 show shaded standard deviation regions."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Table 1 compares AEGIS against three baselines: Perplexity-based Detection (Alon and Kamfonas, 2023), LLaMA 3.1 Guard (Inan et al., 2023), and the Human-Crafted Prompt from Chiang et al. (2024)."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The three baselines date from 2023-2024: Perplexity-based Detection (2023), LLaMA Guard (2023), and the Human-Crafted Prompt (Chiang et al., 2024), which is the direct predecessor on the same dataset. More recent prompt injection defenses (PromptArmor 2025, MELON 2025) are discussed in the related work but are not among the Table 1 baselines."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Section 6 presents ablation studies removing the gradient buffer (Section 6.1), removing multiple gradients (Section 6.2), and training only one side of the GAN (Section 6.3). Figures 4 and 5 show the performance differences across ablation conditions."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Both True Positive Rate (TPR) and True Negative Rate (TNR) are reported for defense evaluation, and Attack Success Rate (ASR) and relative score change (delta_Srel) are reported for attacker evaluation."
     85       },
     86       "human_evaluation": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "The paper's claims are about automated prompt injection detection performance on a pre-labeled dataset. Human evaluation of the system's outputs is not relevant to the core claims; TPR/TNR against pre-labeled attack/benign articles is the appropriate evaluation."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Section 4.1 clearly distinguishes training data (50 GPT-generated benign articles, 40 train / 10 validation) from the real-world evaluation set (143 malicious student articles + 100 benign articles from a prior course). The evaluation set was not used during training."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "The 143 malicious articles contain 'a wide variety of successful prompt injections' with different strategies. A per-category breakdown by injection type would be meaningful but is not provided. Only aggregate TPR and TNR are reported, hiding potential variation across injection strategies."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "The paper does not provide error analysis or qualitative examples of failure cases. Appendix A.4 shows defense prompt improvement examples (successes), but no cases where the defense fails to detect injections are discussed."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The ablation study in Section 6 reports components that degrade performance when removed: removing the gradient buffer leads to ~5% TPR degradation and slower convergence; removing multiple gradients leads to over 10% performance degradation."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims TPR improvement of 0.20 over the previous state of the art with only 0.02 decrease in TNR. Table 1 confirms: AEGIS Iteration 8 achieves TPR=0.84 vs. Human-Crafted Prompt TPR=0.64 (difference 0.20), and TNR=0.89 vs. 0.91 (difference -0.02). The claim about outperforming existing baselines is confirmed by Table 1."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Ablation studies (Section 6) use controlled single-variable manipulation to support causal claims about component contributions. The co-evolution, gradient buffer, and multi-route gradients are each removed individually, providing adequate causal evidence for their contributions."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper is evaluated on a single task (automated assignment grading at NTU) and the Limitation section acknowledges this ('our evaluation focused on automated assignment grading task, which may not fully capture the diversity of real-world security-sensitive tasks'). However, the abstract and title claim general applicability ('Automated Co-Evolutionary Framework for Guarding Prompt Injection') without consistently bounding the scope to the specific tested setting."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper does not discuss alternative explanations for why AEGIS outperforms baselines. No consideration is given to whether improvements might be due to dataset-specific factors (e.g., the student article data being from the same institution as the defense prompt development), or to the model (gpt-4.1-mini) being particularly well-suited to the evaluation setup."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper uses 'gpt-4.1-mini', 'GPT-5-mini', 'GPT-4.1-nano', 'Gemini-2.0-flash', 'Gemini-2.5-flash-lite'. Per the schema: 'Marketing names like \"Gemini-2.5\" or \"GPT-4o\" without a snapshot date or API version do NOT count as specified versions.' No snapshot dates or API version identifiers are provided for any model."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Appendix A.4 (Table 6) shows actual full prompt text for the defense prompts at GAN iterations 0, 4, and 8. Appendix A.6 (Table 11) provides the full Human-Crafted Prompt baseline text. The actual prompts sent to the model are shown."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Table 5 (Appendix A.3) lists framework-level hyperparameters (GAN iterations, optimization iterations, weights, power parameters) but does NOT include LLM API settings: temperature, top-p, max tokens, or sampling parameters. The schema states: 'If the paper uses an LLM API without stating temperature/sampling settings, NO.' The paper calls GPT and Gemini APIs extensively without reporting these settings."
    149       },
    150       "scaffolding_described": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The agentic co-evolutionary scaffolding is described in detail in Section 3 and Appendix A.1-A.2, including Algorithm 1 (co-evolution framework), Algorithm 2 (TGO+ workflow), the evaluation functions Eval() and Val(), the gradient acquisition and application steps, and the gradient buffer mechanism."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 4.1 describes that the 143 malicious student articles were 'manually modified to anonymize personal data and for copyright purposes, while preserving their original strategic intent.' The training/validation split (40/10) and evaluation set composition (143 malicious + 100 benign) are clearly stated."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The paper has a dedicated 'Limitation' section (after Section 7/Conclusion) with substantive discussion of three specific limitations: single-task evaluation scope, text-only system focus, and lack of large-scale human evaluation."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The Limitation section provides specific threats: (1) evaluation limited to automated assignment grading task, (2) defense targets only text-based dialogue systems with effectiveness in multimodal settings unclear, (3) no large-scale human evaluation conducted. These are specific to the study rather than generic disclaimers."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "The Limitation section explicitly states what was NOT shown: generalizability to diverse real-world security tasks is not demonstrated, effectiveness in multimodal systems is not tested, and large-scale human evaluation was not conducted."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The student-submitted articles used for evaluation are not publicly released. Section 4.1 mentions they were modified for anonymization and copyright purposes, but no download link or data access mechanism is provided."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 4.1 describes the data sources: 50 GPT-generated benign articles for training/validation; 143 malicious articles and 100 benign articles from student submissions in a prior NTU course (same course as Chiang et al., 2024). The selection criteria are stated: malicious articles are ones that 'achieve full scores without being detected by the defense.'"
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "The data consists of course submissions from an existing course, not recruited participants. This is archival data from a prior course, not a human subjects study requiring participant recruitment description."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 4.1 documents the full data pipeline: GPT-generated benign articles for training (40) and validation (10); real-world evaluation uses 143 malicious student articles (those bypassing existing defense) + 100 benign student articles. The transformation step (anonymization/modification) is mentioned."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "There is no acknowledgments section and no mention of funding sources anywhere in the paper. It is unclear whether this is unfunded work or whether funding was simply not disclosed."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "All authors are affiliated with Electrical Engineering at National Taiwan University, as stated on the first page with institutional email addresses. No conflict of interest exists with LLM providers; the paper evaluates GPT and Gemini models from third parties."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": false,
    213         "answer": false,
    214         "justification": "No funding is disclosed; cannot assess funder independence. Marked as not applicable since funding source is absent from the paper."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "There is no competing interests statement in the paper. Absence of disclosure is not the same as absence of conflict."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "The paper tests a defense framework/tool against prompt injection attacks, not evaluating a pre-trained model's capability on a benchmark. Per the schema: NA for 'studies that test defenses/tools rather than model knowledge.' The LLMs are used as tools within the defense system."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "Same reasoning: the paper tests a defense tool, not model knowledge on a benchmark. Contamination concerns about whether the model memorized benchmark data are not the primary concern here."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "Same reasoning: the paper evaluates a defense framework, not a model's benchmark performance. The schema specifies NA for papers that test defenses/tools rather than model knowledge."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "The paper does not involve human participants in its experiments. The student-submitted data is archival course data, not a prospective human subjects study."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "The paper uses archival student course data that was already collected and anonymized. No prospective human subject enrollment is conducted, so IRB approval is not required (though the use of student data may warrant mention of ethics review)."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in the experiments."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are recruited; the data consists of archival course submissions."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved; no randomization to experimental conditions is needed."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants or human evaluators of system outputs are involved."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants are involved in the experiments."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "The framework calls GPT-4.1-mini and other OpenAI/Gemini APIs repeatedly across 8 GAN iterations with multiple optimization iterations each, but no API costs, token usage, or wall-clock time is reported."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No total computational budget, GPU hours, total API spend, or hardware specifications are provided for the experiments."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "AEGIS improves the true positive rate (TPR) by 0.20 compared to the previous state of the art, with only a slight decrease in the true negative rate (TNR) of 0.02.",
    292       "evidence": "Table 1 (Section 5.1): AEGIS at Iteration 8 achieves TPR=0.84 vs. Human-Crafted Prompt TPR=0.64 (difference 0.20) and TNR=0.89 vs. 0.91 (difference -0.02). Three-run average with standard deviation reported.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Co-evolution is critical: training only one side of the GAN leads to overfitting to a static opponent and lower overall TPR.",
    297       "evidence": "Section 6.3 ablation study: 'The lack of an adaptive adversary meant that the trained model quickly overfit to its static opponent.' Figure 5 shows single-sided training achieves lower final TPR than the default method.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "The gradient buffer improves convergence and TPR by approximately 5%.",
    302       "evidence": "Section 6.1: 'This change leads to a slower convergence rate and degrades the TPR in defense about 5%.' Figure 5 shows ablation without gradient buffer achieving lower TPR.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "The AEGIS framework generalizes across multiple LLMs, with all tested models achieving better TPR at higher GAN iterations.",
    307       "evidence": "Table 3 (Section 5.3.1): GPT-5-mini, GPT-4.1-nano, Gemini-2.0-flash, and Gemini-2.5-flash-lite all show increasing TPR from GAN iteration 0 to 4. Table 4 (Section 5.3.2) shows transfer of GPT-4.1-mini-generated prompts to other models also improves with more iterations.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Defense prompts generated by AEGIS improve qualitatively over GAN iterations, becoming more specific and comprehensive.",
    312       "evidence": "Appendix A.4 (Table 6) shows the actual defense prompt text at iterations 0, 4, and 8, demonstrating increasing specificity in detecting subtle manipulation strategies. The iteration 0 prompt is short and general; iteration 8 covers indirect appeals, conditional statements, and specific manipulation patterns.",
    313       "supported": "strong"
    314     }
    315   ],
    316   "methodology_tags": [
    317     "benchmark-eval"
    318   ],
    319   "key_findings": "AEGIS is an automated co-evolutionary framework that jointly optimizes attack and defense prompts against prompt injection attacks using a GAN-inspired iterative process with textual gradient optimization (TGO+). On a real-world dataset of student-submitted articles with prompt injections from a National Taiwan University course, AEGIS achieves TPR=0.84 and TNR=0.89, outperforming the best baseline (Human-Crafted Prompt, TPR=0.64, TNR=0.91) by 0.20 TPR at the cost of 0.02 TNR. Ablation studies confirm that co-evolution, the gradient buffer, and the use of multiple gradients each contribute to performance. The framework generalizes across GPT and Gemini model families without model fine-tuning.",
    320   "red_flags": [
    321     {
    322       "flag": "Single narrow task generalization",
    323       "detail": "All evaluation is conducted on a single specific task (automated assignment grading at NTU) with data from a single course. The paper frames this as a general prompt injection defense framework but tests only one domain. The Limitation section acknowledges this but the abstract and introduction make broad claims about prompt injection defense generally."
    324     },
    325     {
    326       "flag": "No statistical significance testing",
    327       "detail": "Comparisons between AEGIS and baselines are made by comparing point estimates in Table 1 (e.g., TPR 0.84 vs. 0.64) without any statistical significance testing. With a test set of 243 articles (143 malicious + 100 benign) and three experimental runs, statistical testing would be feasible and warranted."
    328     },
    329     {
    330       "flag": "No inference cost reporting",
    331       "detail": "The framework calls LLM APIs (GPT-4.1-mini, GPT-5-mini, GPT-4.1-nano, Gemini models) across 8 GAN iterations with multiple optimization iterations each. No API costs or token consumption is reported, making it impossible to assess the practical cost of deploying this system."
    332     },
    333     {
    334       "flag": "Potential benchmark contamination not discussed",
    335       "detail": "The evaluation uses student articles from a 2024 NTU course published in an EMNLP 2024 paper (Chiang et al., 2024). GPT-4.1-mini and Gemini-2.5 models likely have training data cutoffs that include this published work, but no contamination analysis is provided."
    336     },
    337     {
    338       "flag": "Model version ambiguity",
    339       "detail": "The paper uses 'gpt-4.1-mini', 'GPT-5-mini', 'GPT-4.1-nano', 'Gemini-2.0-flash', and 'Gemini-2.5-flash-lite' as model identifiers. These are not standard API versioning identifiers with snapshot dates, making exact reproduction difficult as these models may have changed since the experiments were run."
    340     },
    341     {
    342       "flag": "Training data is LLM-generated",
    343       "detail": "The training set consists of 50 GPT-generated benign articles, not authentic student submissions. This raises a question about whether the optimization might overfit to GPT-style text distributions, though the real-world evaluation on actual student submissions partially mitigates this concern."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "Large language model as an assignment evaluator: Insights, feedback, and challenges in a 1000+ student course",
    349       "authors": [
    350         "Cheng-Han Chiang",
    351         "Wei-Chih Chen",
    352         "Chun-Yi Kuan",
    353         "Chienchou Yang",
    354         "Hung-Yi Lee"
    355       ],
    356       "year": 2024,
    357       "relevance": "Provides the real-world dataset and Human-Crafted Prompt baseline that AEGIS is evaluated against; directly relevant as the predecessor work on LLM-based grading and prompt injection defense."
    358     },
    359     {
    360       "title": "Automatic prompt optimization with 'gradient descent' and beam search",
    361       "authors": [
    362         "Reid Pryzant",
    363         "Dan Iter",
    364         "Jerry Li",
    365         "Yin Tat Lee",
    366         "Chenguang Zhu",
    367         "Michael Zeng"
    368       ],
    369       "year": 2023,
    370       "arxiv_id": "2305.03495",
    371       "relevance": "Introduces the TGO (Textual Gradient Optimization) framework that AEGIS extends, making it directly foundational to the paper's methodology."
    372     },
    373     {
    374       "title": "Detecting language model attacks with perplexity",
    375       "authors": [
    376         "Gabriel Alon",
    377         "Michael Kamfonas"
    378       ],
    379       "year": 2023,
    380       "arxiv_id": "2308.14132",
    381       "relevance": "Serves as one of three baselines evaluated against in Table 1; relevant to prompt injection defense benchmarking."
    382     },
    383     {
    384       "title": "Llama guard: LLM-based input-output safeguard for human-AI conversations",
    385       "authors": [
    386         "Hakan Inan",
    387         "Kartikeya Upasani",
    388         "Jianfeng Chi"
    389       ],
    390       "year": 2023,
    391       "arxiv_id": "2312.06674",
    392       "relevance": "Serves as one of three baselines (LLaMA 3.1 Guard) evaluated against in Table 1; relevant to LLM safety and content moderation."
    393     },
    394     {
    395       "title": "Defending against indirect prompt injection attacks with spotlighting",
    396       "authors": [
    397         "Keegan Hines",
    398         "Gary Lopez",
    399         "Matthew Hall",
    400         "Federico Zarfati",
    401         "Yonatan Zunger",
    402         "Emre Kiciman"
    403       ],
    404       "year": 2024,
    405       "arxiv_id": "2403.14720",
    406       "relevance": "Discussed as a representative training-free defense approach (input-level structure encoding) in the related work section."
    407     },
    408     {
    409       "title": "Robust prompt optimization for defending language models against jailbreaking attacks",
    410       "authors": [
    411         "Andy Zhou",
    412         "Bo Li",
    413         "Haohan Wang"
    414       ],
    415       "year": 2024,
    416       "relevance": "Presents RPO, the most closely related adversarial optimization framework for prompt defense, but operates in a white-box setting in contrast to AEGIS's black-box approach."
    417     },
    418     {
    419       "title": "Survival of the safest: Towards secure prompt optimization through interleaved multi-objective evolution",
    420       "authors": [
    421         "Ankita Sinha",
    422         "Wendi Cui",
    423         "Kamalika Das",
    424         "Jiaxin Zhang"
    425       ],
    426       "year": 2024,
    427       "arxiv_id": "2410.09652",
    428       "relevance": "Closely related multi-objective evolutionary prompt optimization framework that inspired AEGIS's multi-objective scoring approach."
    429     },
    430     {
    431       "title": "Automatic prompt optimization via heuristic search: A survey",
    432       "authors": [
    433         "Wendi Cui",
    434         "Jiaxin Zhang",
    435         "Zhuohang Li",
    436         "Hao Sun",
    437         "Damien Lopez",
    438         "Kamalika Das",
    439         "Bradley A Malin",
    440         "Sricharan Kumar"
    441       ],
    442       "year": 2025,
    443       "arxiv_id": "2502.18746",
    444       "relevance": "Survey of prompt optimization techniques that provides the taxonomy (heuristic-based, static vs. evolving environments) used to situate AEGIS in the literature."
    445     },
    446     {
    447       "title": "MELON: Indirect prompt injection defense via masked re-execution and tool comparison",
    448       "authors": [
    449         "Kaijie Zhu",
    450         "Xianjun Yang",
    451         "Jindong Wang",
    452         "Wenbo Guo",
    453         "William Yang Wang"
    454       ],
    455       "year": 2025,
    456       "relevance": "Discussed as a representative behavioral consistency checking defense approach in the related work; directly relevant to prompt injection defense comparisons."
    457     },
    458     {
    459       "title": "PromptArmor: Simple yet effective prompt injection defenses",
    460       "authors": [
    461         "Tianneng Shi",
    462         "Kaijie Zhu",
    463         "Zhun Wang"
    464       ],
    465       "year": 2025,
    466       "arxiv_id": "2507.15219",
    467       "relevance": "Discussed as a representative LLM-based detection prompt defense approach in the related work section."
    468     }
    469   ]
    470 }

Impressum · Datenschutz