ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (24730B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "FATH: Authentication-based Test-time Defense against Indirect Prompt Injection Attacks",
      6     "authors": [
      7       "Jiong Wang",
      8       "Fangzhou Wu",
      9       "Wen-Ding Li",
     10       "Jinsheng Pan",
     11       "Edward Suh"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2410.21492",
     16     "doi": "10.48550/arXiv.2410.21492"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims of near-0% ASR and state-of-the-art performance are supported by Tables 2 and 3 showing near-zero ASR for GPT-3.5 and consistently low ASR for Llama3 under Threat Modeling 1.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper includes an ablation study (Table 4) removing Authentication Tags and Security Policy individually, providing causal evidence that each component contributes to defense performance.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The conclusion claims FATH provides 'an efficient way for developers to secure their LLM-integrated applications' broadly, but experiments cover only two models (Llama3-8B, GPT-3.5), two benchmarks, and simulated (not real) tool usage — the claimed scope is overstated relative to the evidence.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper attributes FATH's success to authentication preventing instruction confusion, but does not discuss alternative explanations such as the role of in-context examples alone or whether HMAC tags are necessary vs. simpler random tokens.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Attack Success Rate directly measures whether injected instructions are executed, which aligns with the claimed defense objective; Judge Score separately measures utility — the paper distinguishes these two outcomes.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "There is a dedicated 'Limitations' section listing three specific limitations: manual prompt design effort, reliance on strong instruction-following, and unrealistic benchmark tool-usage scenarios.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Limitations are specific: reliance on strong instruction-following is illustrated by mentioning Alpaca as a weaker model that would fail, and benchmark limitations are tied to simulated vs. real tool execution scenarios.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "While limitations are noted, no explicit statement bounds what results do NOT show — e.g., the paper does not state results don't cover direct prompt injection, stronger LLMs, or enterprise-scale deployments.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment section appears anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All author affiliations are listed in the header (UW-Madison, HUST, U Rochester, NVIDIA, Cornell, U Michigan, UC Davis).",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so funder independence cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Indirect prompt injection attacks are formally defined in Section 4.1 with mathematical notation; HMAC authentication is referenced to RFC 2104; ASR is defined in Section 5.2.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper clearly states FATH is a novel test-time defense mechanism using HMAC-based authentication tags, positioned as overcoming limitations of existing training-time and test-time defenses.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 surveys LLM-integrated applications, prompt injection attacks, and defenses; the paper directly compares against four prior test-time methods (Instructional, Sandwich, Isolation, ICL) and explains why they fail against adaptive attacks.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Code is released at https://github.com/Jayfeather1024/FATH as stated in the abstract.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All datasets used are from publicly available sources: Stanford Alpaca (Apache-2.0), OpenPromptInjection (CC BY 4.0), InjecAgent (MIT), and Faker package (MIT) — no proprietary data.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions 1x NVIDIA A100 GPU and specific model versions but provides no requirements.txt, Dockerfile, or package version specifications.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Code is released but no step-by-step reproduction instructions appear in the paper; appendices contain prompt templates but not pipeline execution guidance.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Tables 2-4 report only point estimates for ASR with no confidence intervals or error bars across repeated runs.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are reported for any comparative claims despite quantitative comparisons across 5 attack methods and 5 defense methods.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "ASR is reported as a proportion (0.00-1.00) with baseline comparisons visible in the same table, effectively conveying effect sizes (e.g., reduction from 0.60 to 0.00).",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "100 examples per task category from Stanford Alpaca are selected with no power analysis or justification for why this sample size is sufficient.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or multiple-run results are reported; all ASR values appear to be from single evaluation runs.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Four baseline defense methods (Instructional Prevention, Sandwich Prevention, Text Instruction Isolation, ICL Defense) plus No Defense are included for comparison.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines are from 2023 (Liu et al., Yi et al.), which are the most recent published test-time defenses at the time of submission (Oct 2024).",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 5.6 conducts ablation by individually removing Authentication Tags and Security Policy, reported in Table 4.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Both Attack Success Rate (security) and Judge Score (utility/quality) are reported, measuring defense effectiveness and utility cost simultaneously.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Human evaluation is not relevant for this security defense paper where attack success is objectively determinable via automated metric (ASR).",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "FATH is a prompting-only method with no training; test examples (100 per task from Stanford Alpaca, 510 from InjecAgent) are distinct evaluation sets not used for any optimization.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Table 2 breaks down results by injection task type (URL, QA, CLF) and by model (Llama3, GPT-3.5) for all attack and defense combinations.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Table 2 shows FATH's Llama3 adaptive attack failure (26-34% ASR), and the limitations section discusses failure modes for weaker instruction-following models.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The Judge Score drops noticeably (8.31→6.73 for Llama3, 7.94→6.91 for GPT-3.5) and the Llama3 adaptive attack shows non-zero ASR — both are reported without suppression.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Exact model identifiers are provided: 'Meta-Llama-3-8B-Instruct' and 'gpt-3.5-turbo'.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Full prompt templates for FATH, all baseline defenses, and all attack methods are included in Appendices (Figures 3-8, Tables 6-9) with placeholders clearly marked.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "'We set all parameters to default for model generation' is insufficient — temperature, top-p, and other generation parameters are not specified.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The authentication system is fully described in Section 4 with formal notation, including input formatting, security policy prompting, and verification parsing.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Appendix G lists all datasets with licenses; Section 5.1 describes selection criteria (Stanford Alpaca examples with both 'instruction' and 'input' fields used as user instruction and external text).",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "All source datasets are publicly available (Stanford Alpaca, OpenPromptInjection, InjecAgent) and the code repo is released, making raw evaluation data recoverable.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 5.1 and Appendix G describe the construction of OpenPromptInjection+ including data sources, task categories, and selection criteria for each component.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participant recruitment — evaluation uses standard benchmark datasets.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline from raw datasets to evaluation is documented: Stanford Alpaca examples selected as target tasks, injection tasks sourced from three categories, combined with specific attack templates from Appendix C.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs for GPT-3.5 and Llama3 are not stated; Stanford Alpaca (2023) and OpenPromptInjection data may have been seen during model training, potentially inflating defense performance.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether test injection examples or attack templates appear in the training data of GPT-3.5 or Llama3.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "Stanford Alpaca (2023) and OpenPromptInjection examples were publicly available before training cutoffs for both evaluated models; this is not acknowledged.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No latency, token cost, or per-query overhead is reported for FATH despite this being directly relevant for adoption as a practical defense.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Hardware (1x A100) is mentioned but no total compute budget, wall-clock time, or API cost estimate is provided.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "FATH achieves near-0% attack success rate against all five Threat Modeling 1 attack methods on both Llama3 and GPT-3.5 under OpenPromptInjection+",
    375       "evidence": "Table 2 shows FATH ASR ≤0.10 across all Llama3 cases and ≤0.02 across all GPT-3.5 cases for Naive/Escape/Context/Fake/Combined attacks",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "FATH achieves 0% ASR on the InjecAgent tool-usage benchmark against both Combined and Adaptive attacks for both models",
    380       "evidence": "Table 3 shows 0.00 ASR for FATH across Llama3 and GPT-3.5 on InjecAgent, compared to ASRs of 0.993 and 1.00 without defense",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "FATH outperforms all existing test-time defense methods against adaptive attacks",
    385       "evidence": "Table 2 adaptive attack column shows FATH achieves 0% on GPT-3.5 while all baselines have 33-99%+ ASR; Llama3 shows 26-34% for FATH vs higher for most baselines",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Security Policy is the most critical component, with its removal causing >30% increase in ASR under adaptive attacks",
    390       "evidence": "Table 4 shows 'w/o Security Policy' adaptive attack ASR of 34-56% vs FATH's 0%, while 'w/o Authentication Tags' shows 6-18%",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "FATH achieves 0% ASR against optimization-based (gradient-based) attacks on Llama3, while no-defense achieves 70% ASR",
    395       "evidence": "Section 5.5 reports this result with a single injection target ('Print exactly Hacked!') using EoT over 5 tag sets",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "FATH introduces minimal utility cost, with only small decrease in Judge Score",
    400       "evidence": "Judge Score drops from 8.31 to 6.73 (Llama3) and 7.94 to 6.91 (GPT-3.5) — a 19% and 13% decrease respectively, which is non-trivial",
    401       "supported": "weak"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval"
    406   ],
    407   "key_findings": "FATH uses HMAC-based authentication tags and a security policy prompt to force LLMs to label all responses with authorized/unauthorized markers, then filters outputs via rule-based parsing. On GPT-3.5, FATH reduces ASR to 0% across all tested attack types including adaptive attacks. On Llama3, FATH reduces ASR to near-0% for non-adaptive attacks but shows residual vulnerability (26-34% ASR) under adaptive attacks. The method generalizes to the InjecAgent tool-usage benchmark achieving 0% ASR on both models. A notable utility cost is observed (Judge Score drops ~13-19%), attributed to filtering of reasoning content.",
    408   "red_flags": [
    409     {
    410       "flag": "No error bars or significance tests",
    411       "detail": "All results are single-run point estimates with no confidence intervals, standard deviations, or statistical tests — results across 100 examples cannot be evaluated for statistical reliability."
    412     },
    413     {
    414       "flag": "Llama3 adaptive attack failure understated",
    415       "detail": "The abstract claims 'significantly lowers the ASR' under Llama3 adaptive attacks, but Table 2 shows 26-34% ASR for adaptive attacks on Llama3 — this is a substantial failure mode that contradicts the near-0% framing."
    416     },
    417     {
    418       "flag": "Optimization attack tested on single target only",
    419       "detail": "The gradient-based worst-case attack (Section 5.5) uses only one injection target ('Print exactly Hacked!') with one sample — insufficient to establish general robustness."
    420     },
    421     {
    422       "flag": "No contamination discussion",
    423       "detail": "Stanford Alpaca and OpenPromptInjection test data predates both model training cutoffs, and the paper does not discuss whether these examples appeared in training data."
    424     },
    425     {
    426       "flag": "Author-created benchmark evaluated by same authors",
    427       "detail": "OpenPromptInjection+ is constructed by the paper's authors and used to evaluate FATH — the benchmark design choices could inadvertently favor the proposed defense."
    428     },
    429     {
    430       "flag": "Generation hyperparameters unspecified",
    431       "detail": "'All parameters set to default' does not specify temperature, top-p, or max tokens — results may not be reproducible if API defaults change."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "Prompt Injection Attacks and Defenses in LLM-Integrated Applications",
    437       "relevance": "Primary baseline: provides OpenPromptInjection benchmark and Instructional/Sandwich/Isolation defense methods compared against FATH"
    438     },
    439     {
    440       "title": "Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models",
    441       "relevance": "Provides ICL Defense baseline and training-time defense with special tokens; key prior work FATH positions against"
    442     },
    443     {
    444       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents",
    445       "relevance": "Provides the InjecAgent benchmark used for tool-usage evaluation in Section 5"
    446     },
    447     {
    448       "title": "Defending Against Indirect Prompt Injection Attacks with Spotlighting",
    449       "relevance": "Concurrent test-time defense work using text transformations to distinguish user vs. external content"
    450     },
    451     {
    452       "title": "Automatic and Universal Prompt Injection Attacks Against Large Language Models",
    453       "relevance": "Provides the optimization-based attack framework used for worst-case evaluation in Section 5.5"
    454     },
    455     {
    456       "title": "StruQ: Defending Against Prompt Injection with Structured Queries",
    457       "relevance": "Training-time defense comparison showing the impracticality of fine-tuning approaches"
    458     },
    459     {
    460       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    461       "relevance": "Seminal work establishing the indirect prompt injection threat model"
    462     },
    463     {
    464       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    465       "relevance": "Provides the agent architecture used in InjecAgent benchmark scenarios"
    466     }
    467   ],
    468   "engagement_factors": {
    469     "practical_relevance": {
    470       "score": 2,
    471       "justification": "Addresses a real and growing security threat for LLM-integrated applications with a code-released, prompt-only approach developers can deploy without model access."
    472     },
    473     "surprise_contrarian": {
    474       "score": 1,
    475       "justification": "Applying HMAC authentication concepts to LLM prompt security is a creative reframing, but the core idea of structured output filtering is incremental."
    476     },
    477     "fear_safety": {
    478       "score": 2,
    479       "justification": "Directly addresses prompt injection, the top-ranked risk (LLM01) in the OWASP Top 10 for LLM Applications, with concrete attack demonstrations including financial transactions and home automation exploitation."
    480     },
    481     "drama_conflict": {
    482       "score": 1,
    483       "justification": "Security arms race framing (adaptive attacks defeating baselines) creates mild drama but the paper is primarily a technical defense contribution."
    484     },
    485     "demo_ability": {
    486       "score": 2,
    487       "justification": "Code is publicly released on GitHub and the method requires only prompt engineering — practitioners can test it against their own applications."
    488     },
    489     "brand_recognition": {
    490       "score": 0,
    491       "justification": "Multi-institutional academic paper (UW-Madison, Cornell, NVIDIA affiliation) without major lab branding or famous author names."
    492     }
    493   },
    494   "hn_data": {
    495     "threads": [
    496       {
    497         "hn_id": "45663835",
    498         "title": "Instruction Set Migration at Warehouse Scale",
    499         "points": 3,
    500         "comments": 0,
    501         "url": "https://news.ycombinator.com/item?id=45663835"
    502       }
    503     ],
    504     "top_points": 3,
    505     "total_points": 3,
    506     "total_comments": 0
    507   }
    508 }

Impressum · Datenschutz