ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27930B)


      1 {
      2   "paper": {
      3     "title": "AegisAgent: An Autonomous Defense Agent Against Prompt Injection Attacks in LLM-HARs",
      4     "authors": [
      5       "Yihan Wang",
      6       "Huanqi Yang",
      7       "Shantanu Pal",
      8       "Weitao Xu"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv preprint",
     12     "arxiv_id": "2512.20986"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper states in the Ethical Considerations section: 'Upon the publication of this work, we will release our source code.' This is a promise of future release, not an actual release. No repository URL or archive is provided."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The evaluation uses three standard public benchmarks: USC-HAD, UCI HAR, and PAMAP2, all of which are publicly available datasets. The paper does not collect new proprietary data."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions experiments were run on 'a workstation equipped with three NVIDIA RTX 3090 GPUs (24GB memory each)' and uses Gemma-2-9B, but does not provide a requirements file, Dockerfile, conda environment, or a detailed listing of library versions needed to reproduce the work."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No README, reproduction scripts, or step-by-step instructions are provided. The paper describes the system design and evaluation methodology but does not include enough operational detail for a researcher to reproduce the experiments without the (unpublished) code."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All reported metrics (DA, SC, ASR, RR, HS) are point estimates only. No confidence intervals, error bars, or standard deviations are reported despite 5 repeated runs being mentioned."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims AegisAgent 'significantly' outperforms baselines and achieves higher accuracy across all conditions, but no statistical significance tests (p-values, t-tests, etc.) are performed. Comparisons are made solely by comparing point estimates."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper consistently reports before/after comparisons with baseline context: ASR drops from 62.2% to 33.4% (Table 1), DA drops from ~85% to 7-8% without the Robust Reasoner (Section 5.3), AegisAgent achieves 93.0% DA vs. 64.9% for the best baseline (Section 5.5). Per the schema example ('12% improvement from 45% to 57%' = YES), these provide sufficient context for the magnitude of effects."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper evaluates 15 attack types across 5 target models and 3 datasets but does not justify the number of test instances used per attack or provide a power analysis. The number of test examples from each dataset used in evaluation is not stated."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper states 'each experiment is repeated five times, with average results reported' but does not report standard deviation, IQR, or any other spread measure across the 5 runs. Only averages are given."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares AegisAgent against three categories of baselines: text-only defenses (SafeDecoding), classical HAR defenses (FGSM/PGD adversarial training), and multimodal detection-only defenses (cross-modal consistency methods). Results are shown in Figure 7."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "SafeDecoding (2024) is a recent and relevant baseline. The other baselines (FGSM, PGD, multimodal detection methods) are standard and appropriate comparisons for different threat models."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 5.3 presents an ablation study (RQ2) removing each of the three components (Input Sanitizer, Consistency Verifier, Robust Reasoner) individually across all five target models. Results are shown in Figure 6."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports multiple metrics: Detection Accuracy (DA), Attack Success Rate (ASR), Recovery Rate (RR), Semantic Consistency (SC), and Harm Score (HS), providing several dimensions of evaluation."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation is performed. The paper claims AegisAgent preserves 'semantic fidelity' and produces 'secure outputs,' but evaluation is entirely automated using metrics like DA, ASR, SC. Human evaluation of the repaired output quality would be relevant to validate claims about semantic correctness and practical safety."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The paper does not explicitly clarify train/test splits for the evaluation. While the target models (IMUGPT-2.0, etc.) are evaluated on their respective datasets, it is unclear whether the evaluation examples were distinct from any tuning or configuration choices made during AegisAgent setup."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 2 breaks down Harm Score by individual attack type and category (Signal, Text, Prompt, Hybrid). Table 1 shows per-model and per-LLM-classifier breakdowns. Figure 6 shows per-model ablation results."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The paper does not discuss specific failure cases of AegisAgent. It mentions that 'all stripped-down versions exhibit significant performance degradation' but does not analyze cases where the full system itself fails or types of attacks it cannot handle."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No negative results are reported. The ablation study (Section 5.3) shows component removal hurts performance, which is expected positive validation, not a negative result. No approaches that were tried and abandoned, no configurations that failed, no unexpected findings are reported. Every experiment confirms AegisAgent's effectiveness."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 85% detection accuracy on average and 30% ASR reduction — both are supported by Table 1 (average DA ~85%) and the evaluation section. The 78.6 ms latency is supported by Table 3."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Ablation studies in Section 5.3 use controlled single-component removal to justify the causal claim that each component contributes to defense performance. The design supports causal claims about which components drive the results."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes broad claims about defending 'LLM-HAR systems' but only tests on five specific systems with Gemma-2-9B as the internal model. The title and framing suggest general applicability but the evaluation is specific to particular architectures and datasets. Claims like 'AegisAgent delivers the most consistent and comprehensive defense' extend beyond the tested setting."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for AegisAgent's performance gains. For example, whether the improvements could be partly explained by the added computational budget (Gemma-2-9B reasoning), or whether some of the performance gains on specific attacks might be due to dataset-specific properties rather than general defense capability."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper uses ChatGPT-4o, Gemini, LLaMA-2, DeepSeek-V3, and Gemma-2-9B as LLMs but does not specify exact model versions, API snapshot dates, or checkpoint identifiers. 'GPT-4o' and 'Gemini' without version qualifiers are insufficient."
    138       },
    139       "prompts_provided": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Appendix B provides prompt templates for the five target LLM-HAR models (how they convert signals to prompts), but these are the TARGET SYSTEM prompts, not AegisAgent's own prompts. The actual prompts used by AegisAgent's Planning Agent, Executor Agent, and Robust Reasoner when invoking Gemma-2-9B are not provided. A reader cannot reconstruct the defense system's LLM queries."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper specifies key hyperparameters: similarity threshold tau_san=0.75, MAD threshold tau_MAD=2.5, graylist coefficient alpha=0.4, semantic mismatch threshold tau_c=0.35, semantic conflict threshold tau_sem=0.6, temporal threshold tau_temp=0.6, executor threshold tau_exec=0.85. These are described in Section 4.2."
    148       },
    149       "scaffolding_described": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4 provides detailed description of the AegisAgent scaffolding, including the Input Sanitizer, Consistency Verifier, Robust Reasoner, Memory Hub, Planning Agent, and Executor Agent. The workflow is formalized mathematically and Figure 5 provides a system diagram."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper does not describe how raw IMU data from the three datasets was preprocessed before being fed to the LLM-HAR models. Preprocessing steps (windowing, normalization, feature extraction) are not documented with sufficient detail for reproduction."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion (Section 7) is brief and does not discuss limitations. Ethical considerations mention responsible disclosure but not methodological limitations."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No threats-to-validity section exists. The paper does not discuss threats specific to this study, such as limited number of attack types, restricted model coverage, or the evaluation only using simulated attacks rather than real-world adversaries."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The paper does not explicitly state what the results do NOT show. Claims about 'robustness against a wide range of adversarial perturbations' are not bounded to the tested setting. No explicit statements about what was NOT tested or claimed."
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "Only aggregated performance metrics are presented. Raw prediction results, detection outcomes per example, or the specific adversarial prompts tested are not released. The code and evaluation scripts are promised but not yet available."
    182       },
    183       "data_collection_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The paper uses standard public datasets (USC-HAD, UCI HAR, PAMAP2) that are well-documented in prior work, and references the original publications for each dataset. Attacks are described formally in Section 3.2 and Appendix A (Table 4)."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No human participants were recruited for this study. The evaluation is conducted on automated systems using pre-existing public datasets."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The pipeline from raw IMU signals through LLM-HAR models to evaluation metrics is described at a high level, but intermediate steps (data preprocessing, attack implementation details, evaluation instance selection) are not documented with sufficient detail to independently verify results."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "No acknowledgments section or funding disclosure is present in the paper. There is no mention of grants, institutional funding, or corporate support."
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Author affiliations are listed in the header: City University of Hong Kong and Deakin University. None of the authors appear affiliated with the commercial LLM providers (OpenAI, Google, Meta) whose models are evaluated."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": false,
    212         "answer": false,
    213         "justification": "No funding source is disclosed, making this question inapplicable — there is no identified funder whose independence can be assessed."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "There is no competing interests statement in the paper. The absence of such a declaration cannot be taken as confirmation of no conflicts."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "This paper evaluates a defense system against prompt injection, not a pre-trained model's benchmark performance. AegisAgent is training-free; contamination of training data is not applicable to the main evaluation claims."
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The evaluation does not test knowledge recall from pre-training. The paper tests defense capabilities against attack scenarios, not benchmark accuracy that could be contaminated by training data overlap."
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The public HAR datasets used are sensor datasets for activity classification, not language benchmarks that LLMs might have memorized. Contamination of this type is not a concern for this evaluation design."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study. The evaluation is entirely automated using existing LLM-HAR systems and datasets."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved. The Ethical Considerations section focuses on responsible disclosure of attack vectors, not human subjects ethics."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants were recruited. Demographics are not applicable."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants were involved, so inclusion/exclusion criteria for participants are not applicable."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants were involved and no randomization of experimental conditions over participants was required."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants were involved. The evaluation is entirely automated with ground truth labels from the datasets."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants were involved, so attrition is not applicable."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Section 5.6 (RQ5) reports detailed latency measurements: total 78.6 ms per query including per-module breakdowns (Input Sanitization: 6.3 ms, Consistency Verification: 31.1 ms, etc.). Hardware is specified as three NVIDIA RTX 3090 GPUs."
    280       },
    281       "compute_budget_stated": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "While per-query latency is reported, the total computational budget for the full evaluation (total GPU hours, API costs for GPT-4o and Gemini calls across 15 attack types × 5 models × 3 datasets × 5 repeated runs) is not stated."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "AegisAgent achieves 85% detection accuracy on average and reduces attack success rate by 30% on average across five LLM-HAR systems.",
    291       "evidence": "Table 1 shows DA ranging from 78.5% to 93.0% across target model and LLM classifier combinations. The average is reported as 85% DA in Section 5.2. ASR drops from approximately 59-64% (without AegisAgent) to 27-42% (with AegisAgent).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "AegisAgent significantly outperforms three categories of state-of-the-art defenses (text-only, classical HAR, multimodal detection-only).",
    296       "evidence": "Figure 7 compares DA, RR, and ASR: AegisAgent achieves 93.0% DA and 56.3% RR vs. the best baseline (multimodal detection-only) at 64.9% DA and 21.7% RR. No statistical significance tests are performed.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "All three defense components (Input Sanitizer, Consistency Verifier, Robust Reasoner) are necessary, with the Robust Reasoner being the most critical.",
    301       "evidence": "Section 5.3 ablation study shows removing the Input Sanitizer reduces DA to 53-57%, removing the Consistency Verifier to 46-49%, and removing the Robust Reasoner drops DA to 7-8% (Figure 6). No variance reported.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Hybrid attacks pose the most severe threat to LLM-HAR systems, with Harm Scores reaching 3.8-4.1 compared to 2.6-2.8 for text path attacks.",
    306       "evidence": "Table 2 reports Harm Score averages by attack category: Text Attacks avg 2.70, Signal Attacks avg 3.55, Prompt Attacks avg 3.00, Hybrid Attacks avg 3.97. The scoring rubric (1-5 scale based on safety-critical distance between predicted and true label) is defined in Section 5.1.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "AegisAgent incurs only 78.6 ms of latency overhead per query on a GPU workstation.",
    311       "evidence": "Table 3 provides per-module latency measurements summing to 78.6 ms. Measured on three NVIDIA RTX 3090 GPUs. Only GPU server performance reported; mobile/edge deployment performance is not assessed.",
    312       "supported": "strong"
    313     }
    314   ],
    315   "methodology_tags": [
    316     "benchmark-eval",
    317     "case-study"
    318   ],
    319   "key_findings": "AegisAgent is a training-free, model-agnostic autonomous defense agent for LLM-based human activity recognition (HAR) systems that defends against prompt injection attacks via three-stage processing: input sanitization, cross-modal consistency verification, and robust reasoning. Evaluated across 15 attack types on 5 LLM-HAR systems and 3 public datasets, it achieves 85% average detection accuracy and reduces attack success rates by approximately 30 percentage points. An ablation study shows each of the three components contributes to performance, with the Robust Reasoner (using Gemma-2-9B with chain-of-thought and multi-path voting) being the most critical. The system adds only 78.6 ms of latency overhead on a GPU workstation, suggesting practical deployment feasibility in server settings.",
    320   "red_flags": [
    321     {
    322       "flag": "No statistical tests for comparisons",
    323       "detail": "The paper makes claims of 'significantly' outperforming baselines throughout, but no statistical significance tests are applied. All comparisons are based on point estimates, making it impossible to assess whether differences are meaningful or could be due to random variation."
    324     },
    325     {
    326       "flag": "No variance reporting despite repeated runs",
    327       "detail": "Section 5.1 states experiments are repeated 5 times with average results reported, but no standard deviation, confidence intervals, or other spread measures are reported for any metric. This conceals result stability and makes it impossible to assess reliability."
    328     },
    329     {
    330       "flag": "Model versions not specified",
    331       "detail": "The paper uses ChatGPT-4o, Gemini, LLaMA-2, and DeepSeek-V3 as HAR classifiers but provides no specific version identifiers or API snapshot dates. Model behavior changes across versions, making results non-reproducible."
    332     },
    333     {
    334       "flag": "Code not released",
    335       "detail": "Code is promised 'upon publication' but not currently available. Without code, the 78.6 ms latency claim, the specific threshold values, and the Memory Hub implementation cannot be independently verified."
    336     },
    337     {
    338       "flag": "No limitations section",
    339       "detail": "The paper has no dedicated limitations or threats-to-validity section. Important limitations — such as evaluation only under simulated white-box attacks, restriction to five specific LLM-HAR architectures, and GPU server context for latency — are not discussed."
    340     },
    341     {
    342       "flag": "Overgeneralized claims",
    343       "detail": "The paper claims to defend 'LLM-HAR systems' generally but only tests five specific architectures with Gemma-2-9B as the internal agent model. Whether the approach generalizes to other LLM-HAR architectures or different backbone LLMs is untested."
    344     },
    345     {
    346       "flag": "No disclosure of funding",
    347       "detail": "No funding source or acknowledgments section is present. This omission makes it impossible to assess potential conflicts of interest or institutional support."
    348     }
    349   ],
    350   "cited_papers": [
    351     {
    352       "title": "IMUGPT 2.0: Language-based cross modality transfer for sensor-based human activity recognition",
    353       "authors": [
    354         "Zikang Leng",
    355         "Amitrajit Bhattacharjee",
    356         "Hrudhai Rajasekhar",
    357         "Lizhe Zhang",
    358         "Elizabeth Bruda",
    359         "Hyeokhyen Kwon",
    360         "Thomas Plötz"
    361       ],
    362       "year": 2024,
    363       "relevance": "Primary target LLM-HAR system evaluated; represents state-of-the-art LLM integration with IMU sensing."
    364     },
    365     {
    366       "title": "MotionGPT: Human motion as a foreign language",
    367       "authors": [
    368         "Yuxiao Jiang",
    369         "Wen Li",
    370         "Tianyu Liu",
    371         "Hao Hong",
    372         "Taku Komura",
    373         "Ziwei Liu"
    374       ],
    375       "year": 2023,
    376       "arxiv_id": "2306.08993",
    377       "relevance": "One of five target LLM-HAR systems evaluated; bidirectional text-motion generation model."
    378     },
    379     {
    380       "title": "HAR-GPT: Harnessing LLMs for IMU-based human activity recognition",
    381       "authors": [
    382         "Zhiwei Yang",
    383         "Jie Chen",
    384         "Rui Zhao",
    385         "Cheng Xu",
    386         "Jing Liu"
    387       ],
    388       "year": 2024,
    389       "arxiv_id": "2402.02354",
    390       "relevance": "Target system demonstrating zero-shot HAR via LLM prompting — evaluated as one of five systems."
    391     },
    392     {
    393       "title": "LLaSA: Large language and activity sensing agents",
    394       "authors": [
    395         "Ali Imran",
    396         "Yichuan Xu",
    397         "Dohyun Kim",
    398         "Longqi Chen",
    399         "Hao Zhang",
    400         "Yu Gao"
    401       ],
    402       "year": 2024,
    403       "arxiv_id": "2403.04567",
    404       "relevance": "Target LLM-HAR system with multimodal perception fusion; represents state-of-the-art activity sensing."
    405     },
    406     {
    407       "title": "Universal and transferable adversarial attacks on aligned language models",
    408       "authors": [
    409         "Andy Zou",
    410         "Zhen Wang",
    411         "Nicholas Carlini",
    412         "Milad Nasr",
    413         "Zico Kolter",
    414         "Matt Fredrikson",
    415         "Nicolas Papernot"
    416       ],
    417       "year": 2023,
    418       "arxiv_id": "2307.15043",
    419       "relevance": "Key baseline for prompt injection/jailbreak attacks against LLMs; used as baseline defense comparison."
    420     },
    421     {
    422       "title": "More than you've asked for: A comprehensive analysis of jailbreak attacks against large language models",
    423       "authors": [
    424         "Kai Greshake",
    425         "Tobias Wüchner",
    426         "Thomas Wolf"
    427       ],
    428       "year": 2023,
    429       "arxiv_id": "2309.10253",
    430       "relevance": "Prior work on prompt injection and jailbreak attacks in text-only LLM settings — motivates need for AegisAgent."
    431     },
    432     {
    433       "title": "Ignore previous prompt: Attack techniques for prompt-based LLMs",
    434       "authors": [
    435         "Ethan Perez",
    436         "Juliana Rando",
    437         "Douwe Kiela"
    438       ],
    439       "year": 2022,
    440       "arxiv_id": "2211.09527",
    441       "relevance": "Foundational work on prompt injection attack techniques directly relevant to the threat model studied."
    442     },
    443     {
    444       "title": "SafeDecoding: Defending language models against jailbreak attacks",
    445       "authors": [
    446         "Jiapeng Zeng",
    447         "Yihan Zhang",
    448         "Bo Li"
    449       ],
    450       "year": 2024,
    451       "relevance": "Primary text-only defense baseline compared against AegisAgent in RQ4 evaluation."
    452     },
    453     {
    454       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    455       "authors": [
    456         "Jason Wei",
    457         "Xuezhi Wang"
    458       ],
    459       "year": 2022,
    460       "relevance": "Key technique used in AegisAgent's Robust Reasoner module for structured multi-step reasoning under adversarial conditions."
    461     },
    462     {
    463       "title": "Self-consistency improves chain of thought reasoning in language models",
    464       "authors": [
    465         "Xuezhi Wang",
    466         "Jason Wei"
    467       ],
    468       "year": 2023,
    469       "relevance": "Multi-path voting mechanism used in AegisAgent's Robust Reasoning module to aggregate predictions."
    470     },
    471     {
    472       "title": "Prompt injection attack against LLM-integrated applications",
    473       "authors": [
    474         "Han Guo",
    475         "Qi Zhang",
    476         "Yang Ji"
    477       ],
    478       "year": 2024,
    479       "arxiv_id": "2403.02276",
    480       "relevance": "Recent work on prompt injection attacks in LLM-integrated applications — directly relevant to the attack surface studied."
    481     },
    482     {
    483       "title": "Gemma: Open models based on Gemini research and technology",
    484       "authors": [
    485         "Gemma Team",
    486         "Google DeepMind",
    487         "Google Research"
    488       ],
    489       "year": 2024,
    490       "arxiv_id": "2403.08295",
    491       "relevance": "Gemma-2-9B is the backbone LLM used in AegisAgent's Planning and Executor agents."
    492     }
    493   ]
    494 }

Impressum · Datenschutz