ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (30471B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "FATH: Authentication-based Test-time Defense against Indirect Prompt Injection Attacks",
      6     "authors": [
      7       "Jiong Wang",
      8       "Fangzhou Wu",
      9       "Wen-Ding Li",
     10       "Jinsheng Pan",
     11       "Edward Suh"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2410.21492",
     16     "doi": "10.48550/arXiv.2410.21492"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims 'near 0% ASR on GPT3.5 for various attack methods, surpassing all previous defenses,' which is confirmed by Table 2 showing 0.00-0.02 ASR across all attacks. The abstract also specifies 'under Llama3 and GPT3.5 models,' appropriately bounding the performance claim.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper makes causal claims that FATH's components (Authentication Tags, Security Policy) contribute to defense effectiveness. The ablation study in Section 5.6 provides controlled single-variable manipulation evidence, showing removal of Security Policy increases ASR by 30%+ under adaptive attacks (Table 4).",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "While the abstract bounds specific performance claims to 'Llama3 and GPT3.5,' the title and broader claims like 'effectively defend against indirect prompt injection attacks' and 'securing LLM-integrated applications' are not bounded to the two models tested. The Limitations section partially addresses this but the framing remains broader than the evidence.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not consider alternative explanations for the observed results. For example, it does not discuss whether the low ASR could be due to prompt complexity overwhelming the model rather than the authentication mechanism specifically, or whether the effectiveness is specific to the tested attack patterns.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper measures Attack Success Rate, which directly corresponds to its claims about defense against prompt injection attacks. The Judge Score metric is also clearly framed as a measure of generation quality impact. No proxy-outcome gap exists.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing three specific limitations of FATH.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The Limitations section discusses threats specific to this study: (1) substantial manual effort for prompt design across applications, (2) reliance on instruction-following ability making FATH ineffective on weaker models like Alpaca, (3) limited benchmark coverage that does not include real tool usage scenarios.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper states specific boundaries: FATH's effectiveness 'may be reduced when applied to LLMs with comparatively weaker instruction-following abilities,' and current benchmarks 'can not provide real tool usage scenarios' so the authors 'directly provide external text information to simulate the results of tool execution.'",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment or grant information appears in the paper.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: UW-Madison, Huazhong University, University of Rochester, NVIDIA, Cornell University, University of Michigan, UC-Davis.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No funding information is disclosed, making it impossible to assess funder independence. One author (Edward Suh) is affiliated with NVIDIA, which has a commercial interest in LLM security.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interest declaration appears in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 4.1 formally defines the attack function A, defense function F, and the distinction between user instructions and external text; 'indirect prompt injection attack' is defined in contrast to direct attacks in Section 2.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly frames its contribution as a novel test-time defense (FATH) requiring no model fine-tuning, positioned as solving the gap in existing test-time defenses against adaptive attacks.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 distinguishes FATH from prior training-time defenses and existing test-time methods (Instructional Prevention, Sandwich, ICL), explaining what each lacks and how FATH addresses the gap.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The abstract explicitly states: 'Our code is released at: https://github.com/Jayfeather1024/FATH'.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All benchmark data is publicly available: Stanford Alpaca (Apache-2.0), QA from Zverev et al. 2024 (CC BY 4.0), CLF from OpenPromptInjection (CC BY 4.0), InjecAgent (MIT), and URLs generated by the Python 'fake' package (MIT). Appendix G documents all dataset details and licenses.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions 'Meta-Llama-3-8B-Instruct with 1x NVIDIA A100 GPU' and 'gpt-3.5-turbo with OpenAI API' but provides no requirements.txt, Dockerfile, or detailed library version list for dependencies like the hmac package or Sentence Transformers.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "While the GitHub repository is provided and prompt templates are given in appendices, the paper itself does not include step-by-step reproduction instructions (commands to run, scripts to execute, or a 'Reproducing Results' section).",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All ASR values in Tables 2 and 3 are reported as point estimates (e.g., 0.08, 0.00, 0.26) with no confidence intervals or error bars.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper claims FATH outperforms all baselines based on comparing raw ASR numbers without any statistical significance tests (no p-values, no t-tests, no bootstrap tests).",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Results in Tables 2 and 3 show both baseline and defense ASR values, providing clear context for the magnitude of improvement (e.g., No Defense ASR 0.60 → FATH ASR 0.00 for GPT-3.5 URL Injection under Combined Attack).",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper uses 100 text examples per injection task for OpenPromptInjection+ and 510 for InjecAgent without justifying these sample sizes or performing power analysis.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or multiple-run results are reported. All results appear to be single-run point estimates.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Table 2 compares FATH against four baseline test-time defenses: Instructional Prevention, Sandwich Prevention, Text Instruction Isolation, and ICL Defense, plus a No Defense setting.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines are from Liu et al. 2023b and Yi et al. 2023, which are the most recent test-time defense methods for indirect prompt injection at the time of writing.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 5.6 and Table 4 present ablation studies removing Authentication Tags and Security Policy individually, showing the contribution of each component.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The paper reports both Attack Success Rate (ASR) for defense effectiveness and Judge Score for generation quality impact (Table 2).",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "All evaluation is automated: ASR is computed programmatically and Judge Score uses GPT-3.5 as an LLM judge. No human evaluation of defense effectiveness or output quality.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "The paper does not describe a separate dev/test split. In-context examples are selected via semantic similarity to the user instruction, which means the ICL selection process operates on the same benchmark used for evaluation, without a clear separation.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by three injection task categories (URL, QA, CLF), by five attack methods plus adaptive attacks, and across two models (Table 2).",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "While the paper reports that FATH has higher ASR under adaptive attacks on Llama3 (0.26-0.34 in Table 2), it does not analyze specific failure cases or explain why the defense fails in those instances.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper reports that FATH reduces the Judge Score from 8.31 to 6.73 on Llama3 and from 7.94 to 6.91 on GPT-3.5, acknowledging 'a small decrease in the Judge Score.' It also reports that FATH under Llama3 still allows 26-34% ASR under adaptive attacks.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "The paper specifies 'Meta-Llama-3-8B-Instruct' (a specific model version) but only 'gpt-3.5-turbo' without a snapshot date or API version for the OpenAI model. GPT-3.5-turbo changes behavior over time without a pinned version.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Full prompt templates are provided in Figures 3, 4, 7, 8, and appendices D, E, with the actual text used. The security policy, in-context examples, attack templates, and adaptive attack prompts are all included with explicit text.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Section 5.2 states 'We set all parameters to default for model generation' without specifying the actual values of temperature, top-p, or other sampling parameters.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "FATH is a prompt-based defense method, not an agentic scaffolding system. It uses HMAC tag generation and rule-based parsing, but these are not agentic scaffolding.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 5.1 describes how OpenPromptInjection+ was constructed: selecting examples from Stanford Alpaca with both 'instruction' and 'input', defining three injection task categories (URL, QA, CLF), and details on dataset sources. Appendix G provides dataset licenses and sources.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The paper does not release raw experimental outputs, LLM responses, or per-example evaluation logs for independent verification.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 5.1 and Appendix G describe benchmark construction: 100 examples from Stanford Alpaca per injection task, 510 examples from InjecAgent for direct harm, with sources and licenses documented.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. All data sources are standard public benchmarks and programmatically generated data.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The paper documents the pipeline from data source selection (Section 5.1), through attack injection (Appendix C), defense prompt construction (Section 4 and appendices), to evaluation metric computation (Section 5.2). Dataset construction for OpenPromptInjection+ is described step by step.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "This paper tests a defense method against prompt injection attacks, not model knowledge or capability on a benchmark. Contamination of training data with benchmark answers is not the relevant concern.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "The paper evaluates defense effectiveness, not model knowledge. Whether the model saw similar attack patterns during training is a different concern from traditional train/test contamination.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "Same reasoning: the paper tests defense robustness, not whether models memorized benchmark answers.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "FATH requires generating HMAC tags, running semantic search for ICL examples, and querying the LLM with substantially longer prompts (the defense prompt in Figure 3 is very long). No inference cost, latency, or API cost is reported.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "The paper mentions '1x NVIDIA A100 GPU' for Llama3 but does not state total GPU hours, API costs, or wall-clock time for the experiments.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "All results appear to be from single runs with no seed sensitivity analysis reported.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The paper does not state how many runs produced the reported results. No mention of averaging across runs or number of trials.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "FATH involves design choices (number of ICL examples, tag generation, prompt template structure) but no hyperparameter search budget is reported.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "The paper presents one configuration of FATH (5 tags, N+1 ICL examples) without explaining how this configuration was selected or whether alternatives were tried.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "The paper makes many comparisons across 5 defense methods, 6 attack types, 3 injection tasks, and 2 models, but no correction for multiple comparisons is applied. No statistical tests are used at all.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors propose FATH and evaluate it against their own implementations of baseline defenses without acknowledging potential self-comparison bias.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "FATH uses significantly longer prompts with ICL examples and security policies compared to simpler baselines, but no compute cost comparison across methods is provided.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "The paper introduces OpenPromptInjection+ but does not discuss whether this benchmark accurately represents real-world prompt injection scenarios. The Limitations section acknowledges that 'current benchmarks can not provide real tool usage scenarios' but does not formally analyze construct validity.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "FATH is a prompt-level defense, not a scaffold. No scaffolding comparison is involved.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "The paper does not discuss whether attack patterns in the benchmarks could have appeared in the training data of GPT-3.5 or Llama3, which could affect how models respond to injections.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "Not discussed. The evaluation setup could potentially leak information about the expected response format through the extensive ICL examples.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "The paper does not discuss whether the ICL demonstration examples and test examples share structural similarities that could inflate performance.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No concrete leakage detection or prevention method is applied.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "FATH achieves near 0% ASR on GPT-3.5 under all five Threat Modeling 1 attack methods across three injection task types",
    457       "evidence": "Table 2 shows 0.00 ASR for GPT-3.5+FATH across URL, QA, and CLF tasks for all non-adaptive attacks",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "FATH achieves 0% ASR against adaptive attacks on GPT-3.5 in OpenPromptInjection+ and 0% for both models in InjecAgent",
    462       "evidence": "Table 2 (GPT-3.5 adaptive attack row all zeros) and Table 3 (both Llama3 and GPT-3.5 show 0% under FATH for InjecAgent)",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "FATH outperforms all four existing test-time defense baselines",
    467       "evidence": "Table 2 shows FATH has lower ASR than Instructional, Sandwich, Isolation, and ICL defenses across nearly all conditions for both models",
    468       "supported": "moderate"
    469     },
    470     {
    471       "claim": "The security policy component contributes more to defense than authentication tags",
    472       "evidence": "Table 4 ablation shows removing security policy raises adaptive attack ASR by 30%+ while removing auth tags raises it by a smaller margin",
    473       "supported": "moderate"
    474     },
    475     {
    476       "claim": "FATH maintains 0% ASR against gradient-based optimization attacks on Llama3",
    477       "evidence": "Section 5.5 reports 0% ASR for FATH vs 70% for no-defense under optimization-based attack, with a single example output shown",
    478       "supported": "weak"
    479     },
    480     {
    481       "claim": "FATH's authentication system causes only modest degradation in generation quality",
    482       "evidence": "Judge Score drops from 8.31 (no defense) to 6.73 (FATH) for Llama3, attributed to filtering of reasoning content",
    483       "supported": "moderate"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval"
    488   ],
    489   "key_findings": "FATH uses HMAC-based authentication tags paired with a security policy to authenticate LLM responses, filtering unauthorized (injected) outputs without model fine-tuning. On GPT-3.5, FATH achieves 0% ASR across all attack types in OpenPromptInjection+ and InjecAgent benchmarks; on Llama3, FATH substantially reduces ASR especially against adaptive attacks where baselines collapse. Ablation confirms the security policy (instructing the LLM to generate secret keys) is the more critical component, contributing over 30% ASR reduction vs authentication tags alone. A minor trade-off is observed: FATH reduces Judge Score from ~8.3 to ~6.7 due to filtering of reasoning content from final outputs.",
    490   "red_flags": [
    491     {
    492       "flag": "No statistical testing",
    493       "detail": "All ASR comparisons are reported as point estimates with no confidence intervals, p-values, or variance across runs, making it impossible to assess whether differences are reliable."
    494     },
    495     {
    496       "flag": "GPT-3.5 snapshot unspecified",
    497       "detail": "gpt-3.5-turbo is used without a snapshot date; OpenAI silently updates this model, making exact reproduction impossible."
    498     },
    499     {
    500       "flag": "Only 2 models tested",
    501       "detail": "State-of-the-art claim is based on Llama3-8B and GPT-3.5 only; no evaluation on GPT-4, Claude, Gemini, or larger open-source models."
    502     },
    503     {
    504       "flag": "LLM-as-judge quality metric",
    505       "detail": "Judge Score uses GPT-3.5 as the judge for both Llama3 and GPT-3.5 outputs, so GPT-3.5 grades its own generations, introducing potential systematic bias; no human evaluation corroborates these scores."
    506     },
    507     {
    508       "flag": "Optimization attack limited scope",
    509       "detail": "Section 5.5 tests gradient-based attack on a single injection target ('Print exactly Hacked!') with one target output; generalization of the 0% ASR result is unclear."
    510     },
    511     {
    512       "flag": "No funding disclosure",
    513       "detail": "No acknowledgments or funding source is disclosed anywhere in the paper."
    514     },
    515     {
    516       "flag": "Contamination not discussed",
    517       "detail": "Stanford Alpaca (used as benchmark data) predates both models' training; no discussion of whether test examples appeared in training data."
    518     }
    519   ],
    520   "cited_papers": [
    521     {
    522       "title": "Prompt Injection Attacks and Defenses in LLM-Integrated Applications",
    523       "relevance": "Baseline benchmark (OpenPromptInjection) and three of four baseline defense methods compared against FATH"
    524     },
    525     {
    526       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents",
    527       "relevance": "Second evaluation benchmark testing FATH in realistic tool-use agent scenarios"
    528     },
    529     {
    530       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    531       "relevance": "Foundational work establishing indirect prompt injection as a real-world attack vector"
    532     },
    533     {
    534       "title": "Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models",
    535       "relevance": "Provides ICL defense baseline and training-time defense comparisons"
    536     },
    537     {
    538       "title": "Defending Against Indirect Prompt Injection Attacks with Spotlighting",
    539       "relevance": "Contemporary test-time defense approach that FATH positions against"
    540     },
    541     {
    542       "title": "Automatic and Universal Prompt Injection Attacks Against Large Language Models",
    543       "relevance": "Provides gradient-based optimization attack framework used in worst-case evaluation"
    544     },
    545     {
    546       "title": "StruQ: Defending Against Prompt Injection with Structured Queries",
    547       "relevance": "Training-time defense approach that FATH contrasts against on cost/accessibility grounds"
    548     },
    549     {
    550       "title": "Ignore Previous Prompt: Attack Techniques for Language Models",
    551       "relevance": "Source of context-ignoring and related attack methods used in Threat Modeling 1 evaluation"
    552     }
    553   ],
    554   "engagement_factors": {
    555     "practical_relevance": {
    556       "score": 2,
    557       "justification": "FATH can be applied by developers to defend LLM-integrated applications at inference time without model fine-tuning, but requires significant per-application prompt engineering."
    558     },
    559     "surprise_contrarian": {
    560       "score": 1,
    561       "justification": "The authentication-based framing is a novel angle but the core idea of using tags and ICL for defense is incremental over prior work."
    562     },
    563     "fear_safety": {
    564       "score": 2,
    565       "justification": "Addresses prompt injection (ranked #1 in the OWASP Top 10 for LLM Applications) and demonstrates both attacks and defenses, raising awareness of LLM security risks."
    566     },
    567     "drama_conflict": {
    568       "score": 0,
    569       "justification": "No controversy or dramatic claims; straightforward defense method comparison."
    570     },
    571     "demo_ability": {
    572       "score": 2,
    573       "justification": "Code released on GitHub with prompt templates provided in appendices, but not packaged as a pip-installable tool or live demo."
    574     },
    575     "brand_recognition": {
    576       "score": 1,
    577       "justification": "Authors from UW-Madison and NVIDIA; the evaluation uses GPT-3.5 and Llama3, which are well-known but are not flagship models."
    578     }
    579   },
    580   "hn_data": {
    581     "threads": [
    582       {
    583         "hn_id": "45663835",
    584         "title": "Instruction Set Migration at Warehouse Scale",
    585         "points": 3,
    586         "comments": 0,
    587         "url": "https://news.ycombinator.com/item?id=45663835"
    588       }
    589     ],
    590     "top_points": 3,
    591     "total_points": 3,
    592     "total_comments": 0
    593   }
    594 }

Impressum · Datenschutz