scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (22307B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "LLMail-Inject: A Dataset from a Realistic Adaptive Prompt Injection Challenge",
      6     "authors": [
      7       "Sahar Abdelnabi",
      8       "Aideen Fay",
      9       "Ahmed Salem",
     10       "Egor Zverev",
     11       "Kai-Chieh Liao",
     12       "Chi-Huang Liu",
     13       "Chun-Chih Kuo",
     14       "Jannis Weigend",
     15       "Danyael Manlangit",
     16       "Alex Apostolov",
     17       "Haris Umair",
     18       "João Donato",
     19       "Masayuki Kawakita",
     20       "Athar Mahboob",
     21       "Tran Huu Bach",
     22       "Tsun-Han Chiang",
     23       "Myeongjin Cho",
     24       "Hajin Choi",
     25       "Byeonghyeon Kim",
     26       "Hyeonjin Lee",
     27       "Benjamin Pannell",
     28       "Conor McCauley",
     29       "Mark Russinovich",
     30       "Andrew Paverd",
     31       "Giovanni Cherubin"
     32     ],
     33     "year": 2025,
     34     "venue": "arXiv.org",
     35     "arxiv_id": "2506.09956",
     36     "doi": "10.48550/arXiv.2506.09956"
     37   },
     38   "checklist": {
     39     "claims_and_evidence": {
     40       "abstract_claims_supported": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "All abstract claims (208,095 unique submissions, 839 participants, multiple defense strategies tested, dataset released) are quantitatively verified in Sections 2-3 and Appendix A.",
     44         "source": "haiku"
     45       },
     46       "causal_claims_justified": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper makes causal-adjacent claims such as 'spotlighting reduces the Tool Call rate' and 'stacking all defenses provides a significant improvement,' but the data is purely observational (competition submissions) with no controlled experimental design to support causal inference.",
     50         "source": "haiku"
     51       },
     52       "generalization_bounded": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Results are consistently framed within the challenge setting; broader claims (e.g., 'We need benchmarks for end-to-end attacks') are presented as recommendations rather than empirical conclusions, and the restricted attack objectives are explicitly acknowledged as a scope limitation.",
     56         "source": "haiku"
     57       },
     58       "alternative_explanations_discussed": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Alternative explanations are only briefly noted with hedged language (e.g., 'potentially due to instruction hierarchy training,' 'may be due to the model not properly processing complex formatting') without systematic consideration of competing hypotheses for the observed defense efficacy differences.",
     62         "source": "haiku"
     63       },
     64       "proxy_outcome_distinction": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper clearly distinguishes 'Tool Call' (proxy: send_email invoked regardless of detection) from 'E2E Attack Success' (full success: tool called with correct arguments and defense evaded), using both metrics separately throughout Section 4.",
     68         "source": "haiku"
     69       }
     70     },
     71     "limitations_and_scope": {
     72       "limitations_section_present": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Section 6 is titled 'Limitations and Safety Impact' and provides substantive discussion beyond a single sentence.",
     76         "source": "haiku"
     77       },
     78       "threats_to_validity_specific": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Section 6 identifies specific threats: Phi-3 non-determinism due to API limitations (causing exclusion of certain sub-levels), reliance on LLM annotator for a large subset of labels without ground truth, and restricted attack objectives limiting diversity of attack styles.",
     82         "source": "haiku"
     83       },
     84       "scope_boundaries_stated": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper explicitly states 'we do not recommend directly training on this dataset' and notes the benchmark is scoped to indirect prompt injection in a simulated email assistant with fixed attack objectives, not arbitrary real-world systems.",
     88         "source": "haiku"
     89       }
     90     },
     91     "conflicts_of_interest": {
     92       "funding_disclosed": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No funding source or grant acknowledgment appears anywhere in the paper; the acknowledgments thank individuals for help but disclose no funding agency or institutional support.",
     96         "source": "haiku"
     97       },
     98       "affiliations_disclosed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Author affiliations are fully listed on the first page: Microsoft, ISTA, Trend Micro, RainaResearch, University of Coimbra, Vietnamese German University, SK Shieldus, and HiddenLayer.",
    102         "source": "haiku"
    103       },
    104       "funder_independent_of_outcome": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "Microsoft employees are core organizers and the paper evaluates Microsoft's own Prompt Shield and TaskTracker defenses (cited as 'Microsoft. Prompt shields. 2024'), creating a direct conflict between organizer and evaluated product.",
    108         "source": "haiku"
    109       },
    110       "financial_interests_declared": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "There is no competing interests statement, no declaration of patents or equity stakes, and no disclosure of financial relationships beyond institutional affiliation.",
    114         "source": "haiku"
    115       }
    116     },
    117     "scope_and_framing": {
    118       "key_terms_defined": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Key terms are defined: 'indirect prompt injection' (Section 1), 'end-to-end attack success' vs. 'tool call' (Figure 2 caption), 'Team Success Rate' (Section 4.3, with formal definition), and 'sub-level' (Section 2.4).",
    122         "source": "haiku"
    123       },
    124       "intended_contribution_clear": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The abstract explicitly states the contribution: releasing (1) the challenge code, (2) the full dataset of 208,095 unique submissions, and (3) analysis demonstrating new insights into instruction-data separation, framed as a foundation for future defense research.",
    128         "source": "haiku"
    129       },
    130       "engagement_with_prior_work": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Section 5 (Related Work) engages substantively with prior competitions (Gandalf/Pfister et al., HackAPrompt/Schulhoff et al., SaTML CTF/Debenedetti et al.) and explains specific differentiators: indirect (not direct) injection, tool-calling capability, and adaptive adversaries.",
    134         "source": "haiku"
    135       }
    136     }
    137   },
    138   "type_checklist": {
    139     "benchmark-creation": {
    140       "construct_design": {
    141         "construct_validity_argued": {
    142           "applies": true,
    143           "answer": true,
    144           "justification": "Section 7 explicitly argues why end-to-end evaluation in a realistic email pipeline measures real-world attack capability better than prior benchmarks, noting that 'real-world attacks are typically more complex than evading a defense or causing an LLM to perform a straightforward task.'",
    145           "source": "haiku"
    146         },
    147         "difficulty_distribution_characterized": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Four difficulty levels are designed with specific retrieval configurations and measured empirically via Team Success Rate (Figure 4b) and average trials before first success (Table 2), providing quantitative characterization of difficulty.",
    151           "source": "haiku"
    152         },
    153         "ceiling_floor_effects_checked": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper reports TSR values (some as low as 0.166 for 'All' defenses combined) but does not explicitly discuss ceiling or floor effects as a benchmark design concern or assess whether some sub-levels discriminate poorly.",
    157           "source": "haiku"
    158         },
    159         "human_baseline_included": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "There is no human baseline for what a naive or expert human attacker would achieve; participants ARE humans, but no baseline comparison to automated attack generation or naive human performance is provided.",
    163           "source": "haiku"
    164         },
    165         "scoring_rubric_justified": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Appendix G provides detailed justification for the scoring algorithm with four explicit requirements (stable output, order-based points, difficulty scaling, tie-breaking) and rationale for parameters β=0.95 and min_threshold=0.75.",
    169           "source": "haiku"
    170         }
    171       },
    172       "robustness": {
    173         "contamination_resistance_designed": {
    174           "applies": true,
    175           "answer": true,
    176           "justification": "Phase 2 implements a Conformal Blocklist (Appendix H) that blocks Phase 1 attacks and their paraphrases, with theoretical guarantees; a random suffix is also added to the tool name to prevent hardcoded attack payloads.",
    177           "source": "haiku"
    178         },
    179         "temporal_robustness_discussed": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "The Data Card notes 'limited maintenance' with no major updates planned, and future iterations will be 'released as new datasets,' but there is no substantive discussion of how quickly this benchmark will be gamed or obsoleted by model improvements.",
    183           "source": "haiku"
    184         },
    185         "failure_modes_discussed": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Section 6 and Section 7 discuss benchmark failure modes: restricted attack objectives limiting diversity, non-determinism in Phi-3 requiring exclusion of sub-levels, LLM annotator replacing ground truth for a large subset, and the note that direct training on the dataset is not recommended.",
    189           "source": "haiku"
    190         },
    191         "baseline_implementations_provided": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Challenge code is released on GitHub (microsoft/llmail-inject-challenge), dataset on HuggingFace, and analysis code on a separate GitHub repo, enabling reproduction of reported numbers.",
    195           "source": "haiku"
    196         }
    197       },
    198       "documentation": {
    199         "dataset_documentation_complete": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Section A.2 follows the Pushkarna et al. Data Card format with dataset ownership, overview, risk and mitigation, example data point, sensitive attributes, data distributions, and maintenance plan; collection methodology is described throughout Sections 2-3.",
    203           "source": "haiku"
    204         },
    205         "licensing_and_access_clear": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "The dataset is published under the MIT license on HuggingFace (URL provided in footnote 1), with competition rules governing participant consent to data release reproduced in full in Appendices A.2 and B.",
    209           "source": "haiku"
    210         },
    211         "intended_use_specified": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Section 6 explicitly states 'we do not recommend directly training on this dataset' with justification, while Section A.2 specifies intended use as evaluating defenses and supporting defensive security research.",
    215           "source": "haiku"
    216         }
    217       }
    218     }
    219   },
    220   "claims": [
    221     {
    222       "claim": "Only 0.8% of Phase 1 submissions (3,018 of 370,724) resulted in successful end-to-end attacks",
    223       "evidence": "Section 3 statistics confirmed by multi-stage outcome diagram Figure 3a showing 3,018 ContentOk successes from 370,724 total submissions",
    224       "supported": "strong"
    225     },
    226     {
    227       "claim": "LLM Judge had the highest detection rate among individual defenses (99.4% recall on Phase 1 attacks)",
    228       "evidence": "Table 3 shows LLM Judge recall of 0.994 in Phase 1, the highest among the individual defenses tested in that phase, and 0.965 in Phase 2",
    229       "supported": "strong"
    230     },
    231     {
    232       "claim": "Combining all defenses provides substantial improvement over any individual defense",
    233       "evidence": "Table 4 shows combined defenses achieving 0.996-0.998 recall vs. best individual (LLM Judge: 0.994); Table 1 shows TSR drops to 0.166 for All vs. 0.318-0.600 for individual defenses",
    234       "supported": "moderate"
    235     },
    236     {
    237       "claim": "GPT-4o-mini sub-levels were harder to attack than Phi-3 sub-levels",
    238       "evidence": "Figure 4a shows lower Team Success Rate for GPT-4o-mini than Phi-3, consistent across sub-level subsets in Figure 17; attributed to instruction hierarchy training",
    239       "supported": "strong"
    240     },
    241     {
    242       "claim": "TaskTracker performance improves significantly with more recent underlying LLMs",
    243       "evidence": "Table 3 shows TaskTracker Phi-3 14B recall 0.761, Phi-3.5 MoE 0.949, Phi-4 14B 0.881 (Phase 1); Phase 2 shows Phi-4 14B reaching 0.998",
    244       "supported": "moderate"
    245     },
    246     {
    247       "claim": "Many successful attacks use 'declarative sentences' rather than explicit instructions to the LLM",
    248       "evidence": "Section 7 qualitative observation and example in text, but no systematic quantification of what fraction of successful attacks were declarative vs. explicit",
    249       "supported": "weak"
    250     },
    251     {
    252       "claim": "Attacks that succeed against the hardest defense tend to transfer to easier defenses",
    253       "evidence": "Winning team strategies in Appendix I consistently report transfer (e.g., 'prompts that work for LLM-as-a-Judge also work in other scenarios'), and Table 2 shows knowledge accumulation across levels",
    254       "supported": "moderate"
    255     }
    256   ],
    257   "methodology_tags": [
    258     "benchmark-eval",
    259     "observational"
    260   ],
    261   "key_findings": "The LLMail-Inject challenge produced 208,095 unique adaptive prompt injection attacks from 839 participants, with only 0.8% achieving end-to-end success in Phase 1 and 0.3% in Phase 2. LLM Judge was the most effective single defense (99.4% detection recall) while Prompt Shield was most permissive; stacking all defenses dramatically reduced attack success (TSR=0.166 with all defenses combined). Successful attacks frequently exploited special tokens, multilingual prompts, and 'declarative' social-engineering framing rather than explicit commands, revealing that existing defenses struggle with context-sensitive instruction detection. Phase 2 demonstrated that updating defenses with Phase 1 data yields measurable improvements, validating the adaptive challenge design.",
    262   "red_flags": [
    263     {
    264       "flag": "Organizer-evaluates-own-product",
    265       "detail": "Microsoft employees are core competition organizers while two of the tested defenses (Prompt Shield, TaskTracker) are Microsoft products or research outputs, creating an undisclosed conflict of interest with no competing interests statement."
    266     },
    267     {
    268       "flag": "LLM annotator as ground truth",
    269       "detail": "104,583 of ~169,598 Phase 1 submissions are labeled using an LLM annotator rather than human ground truth; the paper acknowledges but does not quantify the error rate of this annotation, and a known false-negative example is shown in Appendix J."
    270     },
    271     {
    272       "flag": "Phi-3 sub-levels selectively excluded",
    273       "detail": "Phi-3 spotlighting sub-levels were excluded from analysis and Phase 2 due to non-determinism and degraded utility, potentially biasing comparative defense results toward conditions where defenses appear more effective."
    274     },
    275     {
    276       "flag": "No statistical significance testing",
    277       "detail": "Defense efficacy comparisons (TSR, recall differences between defenses) are reported without confidence intervals or p-values; the standard deviations in Table 2 are very large (e.g., 668.6 for a mean of 156.6), suggesting high variance that undermines point estimates."
    278     },
    279     {
    280       "flag": "Two LLMs only",
    281       "detail": "All defense evaluations are based on only two LLMs (Phi-3-medium and GPT-4o-mini), making it unclear how results generalize to other model families, sizes, or instruction-tuning paradigms."
    282     }
    283   ],
    284   "cited_papers": [
    285     {
    286       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    287       "relevance": "Foundational paper on indirect prompt injection attacks that motivates the challenge design"
    288     },
    289     {
    290       "title": "Defending against Indirect Prompt Injection Attacks with Spotlighting",
    291       "relevance": "One of the primary defenses evaluated in the challenge; spotlighting baseline implementation"
    292     },
    293     {
    294       "title": "Dataset and Lessons Learned from the 2024 SaTML LLM Capture-the-Flag Competition",
    295       "relevance": "Prior related competition dataset; directly compared to in Related Work section"
    296     },
    297     {
    298       "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    299       "relevance": "Prior agentic prompt injection benchmark used for contextualizing LLMail-Inject's contribution"
    300     },
    301     {
    302       "title": "Ignore This Title and HackAPrompt: Exposing Systemic Vulnerabilities of LLMs through a Global Prompt Hacking Competition",
    303       "relevance": "Prior competition dataset for direct prompt injection; Related Work comparison"
    304     },
    305     {
    306       "title": "Get My Drift? Catching LLM Task Drift with Activation Deltas",
    307       "relevance": "Source paper for the TaskTracker defense evaluated in the challenge"
    308     },
    309     {
    310       "title": "Can LLMs Separate Instructions from Data? And What Do We Even Mean by That?",
    311       "relevance": "Related benchmark for data-instruction separation; positions LLMail-Inject in the evaluation landscape"
    312     },
    313     {
    314       "title": "Gandalf the Red: Adaptive Security for LLMs",
    315       "relevance": "Prior prompt injection competition (direct, not indirect) used for direct comparison in Related Work"
    316     },
    317     {
    318       "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions",
    319       "relevance": "Explains GPT-4o-mini's instruction hierarchy training, used to interpret differential attack success rates"
    320     },
    321     {
    322       "title": "Data Cards: Purposeful and Transparent Dataset Documentation for Responsible AI",
    323       "relevance": "Framework used for the dataset documentation (Data Card format) in Appendix A.2"
    324     }
    325   ],
    326   "engagement_factors": {
    327     "practical_relevance": {
    328       "score": 3,
    329       "justification": "Full dataset (208K submissions), challenge code, and analysis released publicly on HuggingFace and GitHub, directly usable by security practitioners building or evaluating prompt injection defenses."
    330     },
    331     "surprise_contrarian": {
    332       "score": 2,
    333       "justification": "Counter-intuitive finding that Level 1 (only 2 emails, injection at end of context) was harder than Level 2 for GPT-4o-mini, and that many successful attacks used 'declarative sentences' indistinguishable from normal emails rather than explicit commands."
    334     },
    335     "fear_safety": {
    336       "score": 3,
    337       "justification": "Directly addresses real-world agentic LLM security with tool-calling exploitation; demonstrates that sophisticated attackers can achieve end-to-end exfiltration with only hundreds of attempts against deployed defenses."
    338     },
    339     "drama_conflict": {
    340       "score": 1,
    341       "justification": "Competition format with prizes and leaderboard adds some drama, but no major controversy or contradictory findings; paper is primarily descriptive."
    342     },
    343     "demo_ability": {
    344       "score": 3,
    345       "justification": "Dataset publicly available on HuggingFace, challenge code on GitHub; others can immediately run the evaluation pipeline and test new defenses against the collected attacks."
    346     },
    347     "brand_recognition": {
    348       "score": 2,
    349       "justification": "Microsoft is a recognizable brand and the paper involves Microsoft products (Prompt Shield, Phi-3), but this is not a flagship consumer-facing product; IEEE SaTML affiliation adds academic credibility."
    350     }
    351   },
    352   "hn_data": {
    353     "threads": [
    354       {
    355         "hn_id": "46494285",
    356         "title": "KGGen: Extracting Knowledge Graphs from Plain Text with Language Models",
    357         "points": 20,
    358         "comments": 4,
    359         "url": "https://news.ycombinator.com/item?id=46494285"
    360       },
    361       {
    362         "hn_id": "44567853",
    363         "title": "The Mythical Good Software",
    364         "points": 4,
    365         "comments": 0,
    366         "url": "https://news.ycombinator.com/item?id=44567853"
    367       },
    368       {
    369         "hn_id": "42792507",
    370         "title": "A Multi-Agent System for Hybrid Optimization",
    371         "points": 3,
    372         "comments": 0,
    373         "url": "https://news.ycombinator.com/item?id=42792507"
    374       },
    375       {
    376         "hn_id": "36427198",
    377         "title": "The temporal dynamics of group interactions in higher-order social networks",
    378         "points": 3,
    379         "comments": 0,
    380         "url": "https://news.ycombinator.com/item?id=36427198"
    381       },
    382       {
    383         "hn_id": "44948011",
    384         "title": "Invertible Syntax Without the Tuples (Functional Pearl)",
    385         "points": 2,
    386         "comments": 0,
    387         "url": "https://news.ycombinator.com/item?id=44948011"
    388       },
    389       {
    390         "hn_id": "43361360",
    391         "title": "A Survey of Long Chain-of-Thought for Reasoning Large Language Models",
    392         "points": 2,
    393         "comments": 0,
    394         "url": "https://news.ycombinator.com/item?id=43361360"
    395       },
    396       {
    397         "hn_id": "44276386",
    398         "title": "Resa: Transparent Reasoning Models via SAEs",
    399         "points": 1,
    400         "comments": 0,
    401         "url": "https://news.ycombinator.com/item?id=44276386"
    402       },
    403       {
    404         "hn_id": "43430336",
    405         "title": "Empowering LLMs for Time Series Forecasting with Temporal Patterns and Semantics",
    406         "points": 1,
    407         "comments": 0,
    408         "url": "https://news.ycombinator.com/item?id=43430336"
    409       }
    410     ],
    411     "top_points": 20,
    412     "total_points": 36,
    413     "total_comments": 4
    414   }
    415 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs