ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26516B)


      1 {
      2   "paper": {
      3     "title": "Attacks by Content: Automated Fact-checking is an AI Security Issue",
      4     "authors": ["Michael Schlichtkrull"],
      5     "year": 2025,
      6     "venue": "Conference on Empirical Methods in Natural Language Processing (EMNLP 2025)",
      7     "arxiv_id": "2510.11238",
      8     "doi": "10.18653/v1/2025.emnlp-main.431"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "A GitHub repository is provided: https://github.com/MichSchli/AgentCogSec (footnote 1, Section 1). The paper also states the attack dataset is available at that repository (Appendix B)."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The full dataset of 60 attack scenarios is released via the GitHub repository at https://github.com/MichSchli/AgentCogSec/attack_dataset.json (Appendix B)."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is mentioned in the paper. The paper does not describe how to set up the experimental environment."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "While the prompts are provided in Appendix A and the data is released, there are no step-by-step reproduction instructions, no README with commands to run, and no scripts to replicate the experiments."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Tables 1, 2, and 3 report only point estimates (e.g., '78.3%', '90.0%') with no confidence intervals, error bars, or uncertainty measures."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper makes comparative claims (e.g., 'the clear stand-out is Llama 3.1 8b', 'vulnerability rates are significantly different across the three conditions' in Appendix D) but no statistical significance tests are reported."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Vulnerability rates are reported as percentages with baseline context across conditions (baseline, fact-check, source warning, both), allowing readers to compute the magnitude of defense effectiveness. E.g., Claude Sonnet 4 goes from 61.7% baseline to 15.0% with fact-check to 0.0% with both defenses (Table 1)."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The sample size is 60 scenarios (10 per area of concern), but no justification for why 60 scenarios or 10 per category is sufficient is provided. No power analysis is discussed."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No variance, standard deviation, or spread measures are reported. Each scenario appears to be run once per model. No repeated runs are mentioned."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Table 1 includes a 'Baseline' column (no protection) compared against three defense conditions (fact-check, source warning, both). Multiple models serve as baselines against each other."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The models tested are highly contemporary: GPT-4.1, o4-mini-high, Claude Sonnet 4, Claude Opus 4, Gemini 2.5 Pro/Flash, Llama 3.3, Qwen 3, DeepSeek R1, Grok 3. These represent state-of-the-art models as of 2025."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The experimental design includes ablation-like comparisons: baseline (no defense), fact-check only, source warning only, and both combined. Appendix D further ablates the Grok system prompt with and without epistemological warning lines."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "Only one metric is used: vulnerability rate (percentage of attacks the model passes on to users). No other metrics such as false positive rate (rejecting legitimate information) are measured."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The paper evaluates whether LLMs are vulnerable to content attacks, which involves subjective quality judgments about agent behavior. No human evaluation of whether the model's decisions were correct or reasonable is included. The scenarios were manually created/edited but there is no human evaluation of model outputs."
     85       },
     86       "held_out_test_set": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "This is not a machine learning training/testing study. The 60 scenarios are evaluated directly with no tuning/selection process requiring train/test separation."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Table 2 provides per-category breakdowns across all 6 areas of concern (Charity, Finance, Healthcare, Law, Politics, Useless products) for each model."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper discusses failure patterns: charity fraud is identified as the most problematic category where almost all attacks succeeded against all models (Section 5, Table 2). The finding that larger models are not necessarily less vulnerable is also a failure-case discussion."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Several negative findings are reported: defenses that substantially reduce vulnerability still leave many models vulnerable; fact-checking is not sufficient for all cases (Limitations section notes 60-65% accuracy of current systems); charity fraud remains extremely difficult to defend against."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims are: (1) injection of instructions is not necessary to manipulate agents (supported by Table 1 baseline column), (2) existing defenses are ineffective against attacks by content (supported by the conceptual argument in Section 2), (3) fact-checking is proposed as a defense (supported by Table 1 showing reduced vulnerability with fact-checks). Claims are appropriately hedged."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper makes causal claims about defenses reducing vulnerability (e.g., 'Fact-checking and source warnings were both highly effective defense strategies'). These are supported by controlled experimental manipulation — identical scenarios tested with and without each defense, holding all else constant. This is adequate for causal inference about the defense's effect."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper generalizes broadly from a narrow experimental setup. The title claims 'Automated Fact-checking is an AI Security Issue' broadly, but the experiment tests only a simulated research agent on 60 hand-crafted scenarios using a simple yes/no prompt. The paper does not adequately bound generalizations to this specific setting — it uses language like 'agents must critically evaluate retrieved information' as a general claim based on this limited evidence."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper does not discuss alternative explanations for the observed vulnerability rates. For instance, the vulnerability might be an artifact of the specific prompt framing (asking 'should you pass this on' may bias toward YES), the synthetic nature of the scenarios, or the simplicity of the agent setup. The Limitations section acknowledges fact-checking limitations but does not discuss alternative explanations for the experimental results."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Table 1 lists specific model identifiers: meta-llama/llama-3.1-8b-instruct, meta-llama/llama-3.1-70b-instruct, openai/gpt-4.1, openai/o4-mini-high, anthropic/claude-sonnet-4, anthropic/claude-opus-4, google/gemini-2.5-flash, google/gemini-2.5-pro, deepseek/deepseek-r1-0528, etc. These are API-level model identifiers sufficient for reproduction."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The full prompt templates are provided in Appendix A (Figures 5 and 6), including the exact text with placeholders for task, attack_text, fact_check, and source_warning. An example data point filling these placeholders is provided in Appendix B (Figure 7). The full dataset is also released on GitHub."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No hyperparameters are reported — temperature, top-p, max tokens, and other sampling settings are not mentioned despite using multiple LLM APIs."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "The experimental setup uses a simple single-prompt, single-response design (Figures 5 and 6). There is no agentic scaffolding — the LLM is given a prompt and asked for a yes/no response."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 4 describes the data creation process: 'Each scenario was created by initially generating a fictional scenario using Claude 4 Opus, which we then manually edited for plausibility.' The 60 scenarios are distributed as 10 per area of concern (Section 5). The full dataset is released."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 8 is a dedicated 'Limitations' section discussing that automated fact-checking does not guarantee protection, current systems correctly verify only 60-65% of claims, and attackers might circumvent protections."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The Limitations section raises specific threats: (1) current fact-checking systems only verify 60-65% of real-world claims, (2) attackers might inject adversarial data into evidence sources, (3) claims might be crafted to be adversarial against fact-checking systems. The Ethics section (Section 9) also raises specific concerns about bias in fact-checking tools."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The paper does not explicitly state what results do NOT show. It does not bound the findings to the specific experimental setup (60 synthetic scenarios, simple yes/no agent, specific prompt design). The Limitations section discusses fact-checking limitations but not the scope boundaries of the empirical findings."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The attack dataset is released on GitHub (https://github.com/MichSchli/AgentCogSec/attack_dataset.json, Appendix B). However, the raw model outputs (YES/NO responses and reasoning) are not mentioned as being available."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 4 describes the data collection: 60 scenarios across 6 areas of concern, 10 per area, initially generated by Claude 4 Opus then manually edited for plausibility."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants were involved in the study. The study evaluates LLM responses to synthetic scenarios."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper describes scenario creation but does not document the full data pipeline: how model outputs were collected, how vulnerability rates were computed from raw outputs, whether any responses were ambiguous or excluded. The step from model response to vulnerability rate percentage is not fully documented."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Acknowledgments section states: 'This work was supported by the Engineering and Physical Sciences Research Council [grant number EP/Y009800/1], through funding from Responsible AI UK (KP0016).'"
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The author's affiliation is clearly stated: School of Electronic Engineering and Computer Science, Queen Mary University of London. The author is not affiliated with any of the model providers being tested."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The funder (EPSRC / Responsible AI UK) is a UK government research council with no financial stake in the vulnerability of any specific model or the effectiveness of fact-checking as a defense."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "The experiment does not evaluate pre-trained models on a benchmark testing their knowledge or capabilities in a way that could be contaminated by training data. The scenarios are synthetic and test agent behavior (whether to pass information to users), not model knowledge."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "The experimental setup tests agent decision-making on novel synthetic scenarios, not model knowledge on existing benchmarks. Contamination is structurally irrelevant."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "The 60 scenarios were newly created for this paper using Claude 4 Opus and then manually edited. There is no pre-existing benchmark that could be contaminated."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants were involved in this study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants were involved in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants were involved in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants were involved in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants were involved in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants were involved in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants were involved in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "The paper tests 20 models across 60 scenarios with 4 conditions each (4,800 API calls minimum) but reports no inference cost, token consumption, or API spending."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No total computational budget, API costs, or hardware used are mentioned."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "LLM-based agents are highly vulnerable to attacks by content (malicious data rather than malicious instructions), with baseline vulnerability rates ranging from 61.7% to 98.3% across 20 models.",
    287       "evidence": "Table 1 shows baseline vulnerability rates for 20 models on 60 attack scenarios. The lowest is Claude Sonnet 4 at 61.7% and the highest are Cohere Command R Plus and Command A at 98.3%.",
    288       "supported": "moderate"
    289     },
    290     {
    291       "claim": "Fact-checking and source warnings are effective defenses against attacks by content, with the combination reducing vulnerability rates drastically.",
    292       "evidence": "Table 1 shows that combining fact-checks and source warnings reduces vulnerability for most models (e.g., Llama 3.1 8b from 78.3% to 0.0%, GPT-4.1 from 90.0% to 6.7%). However, no statistical tests are provided.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "Charity fraud is the most problematic category, where almost all attacks succeeded against all models.",
    297       "evidence": "Table 2 shows the Charity column averaging 99.5% vulnerability across all models, with most models at 100%.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "Media literacy skills in LLMs do not correlate with model size — smaller models within a family are often less vulnerable.",
    302       "evidence": "Section 4 discusses this finding with reference to Table 1: Llama 3.1 8b (78.3%) is less vulnerable than Llama 3.1 405b (95.0%); Claude Sonnet 4 (61.7%) is less vulnerable than Claude Opus 4 (65.0%).",
    303       "supported": "weak"
    304     },
    305     {
    306       "claim": "Epistemological guidance in system prompts improves model robustness against attacks by content.",
    307       "evidence": "Appendix D (Table 3) shows Grok 3 baseline vulnerability drops from 91.7% with a generic prompt to 65.0% with the official Grok prompt containing epistemological warnings. However, the word 'significantly' is used without statistical testing, and this is tested on only one model.",
    308       "supported": "weak"
    309     }
    310   ],
    311   "methodology_tags": ["benchmark-eval", "theoretical"],
    312   "key_findings": "This position paper introduces 'attacks by content' — indirect data attacks where adversaries subvert AI agents through biased, misleading, or false information rather than injected instructions. Experiments across 20 contemporary LLMs show high vulnerability rates (61.7%-98.3% baseline), with charity fraud being the most effective attack category (99.5% average success). Automated fact-checking and source warnings are proposed as defenses, showing large reductions in vulnerability when combined (e.g., from 90.0% to 6.7% for GPT-4.1), although no statistical significance tests are reported. The paper also finds that vulnerability does not decrease with model scale within model families.",
    313   "red_flags": [
    314     {
    315       "flag": "Tiny evaluation dataset",
    316       "detail": "Only 60 synthetic scenarios are used (10 per area of concern). With such small sample sizes per category, individual scenario design choices could heavily influence category-level and overall vulnerability rates. No justification for why 60 scenarios suffice."
    317     },
    318     {
    319       "flag": "No uncertainty quantification",
    320       "detail": "All results are point estimates with no confidence intervals, error bars, or variance measures. The paper uses the word 'significantly' (Appendix D) without any statistical test. With only 10 scenarios per category, a single scenario shifts a category-level rate by 10 percentage points, so even sizeable differences in vulnerability rates may not be statistically meaningful."
    321     },
    322     {
    323       "flag": "Synthetic scenarios only",
    324       "detail": "All 60 attack scenarios were generated by Claude 4 Opus and manually edited. No real-world attacks or naturalistic misinformation were tested. The generalizability of vulnerability rates to real attack content is unknown."
    325     },
    326     {
    327       "flag": "Simplistic agent design",
    328       "detail": "The experimental 'agent' is a single-prompt, single-response system asked to make a yes/no decision (Figures 5 and 6). Real-world agents have multi-step reasoning, tool use, and memory. The vulnerability rates may not transfer to more sophisticated agent architectures."
    329     },
    330     {
    331       "flag": "No false positive measurement",
    332       "detail": "The paper measures only vulnerability rate (passing on malicious content) but does not measure whether the defenses also cause the agent to reject legitimate information. The utility cost of the proposed defenses is not assessed."
    333     },
    334     {
    335       "flag": "Missing hyperparameters",
    336       "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the 20 models tested. Different temperature settings could substantially change vulnerability rates."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    342       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    343       "year": 2023,
    344       "relevance": "Foundational work on indirect prompt injection attacks against LLM-based applications, directly related to the survey's scope on AI security."
    345     },
    346     {
    347       "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents",
    348       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    349       "year": 2024,
    350       "relevance": "Benchmark for evaluating prompt injection attacks and defenses in LLM agents, directly relevant to agent security evaluation methodology."
    351     },
    352     {
    353       "title": "Defeating prompt injections by design",
    354       "authors": ["Edoardo Debenedetti", "Ilia Shumailov", "Tianqi Fan", "Jamie Hayes", "Nicholas Carlini"],
    355       "year": 2025,
    356       "arxiv_id": "2503.18813",
    357       "relevance": "Proposes architectural defenses against prompt injection, relevant to the survey's coverage of LLM safety and defense mechanisms."
    358     },
    359     {
    360       "title": "Can llms be scammed? a baseline measurement study",
    361       "authors": ["Udari Madhushani Sehwag", "Kelly Patel", "Francesca Mosca", "Vineeth Ravi", "Jessica Staddon"],
    362       "year": 2024,
    363       "arxiv_id": "2410.13893",
    364       "relevance": "Baseline study on LLM vulnerability to scams, directly relevant to measuring AI agent security and robustness."
    365     },
    366     {
    367       "title": "How johnny can persuade LLMs to jailbreak them: Rethinking persuasion to challenge AI safety by humanizing LLMs",
    368       "authors": ["Yi Zeng", "Hongpeng Lin", "Jingwen Zhang", "Diyi Yang", "Ruoxi Jia", "Weiyan Shi"],
    369       "year": 2024,
    370       "relevance": "Studies LLM susceptibility to persuasion-based jailbreaks, relevant to understanding data-based (rather than instruction-based) attacks on AI systems."
    371     },
    372     {
    373       "title": "Teaching models to balance resisting and accepting persuasion",
    374       "authors": ["Elias Stengel-Eskin", "Peter Hase", "Mohit Bansal"],
    375       "year": 2025,
    376       "arxiv_id": "2410.14596",
    377       "relevance": "Addresses the challenge of models needing to accept beneficial persuasion while resisting harmful persuasion, directly relevant to agent defense."
    378     },
    379     {
    380       "title": "Aligning llms to be robust against prompt injection",
    381       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "Chuan Guo"],
    382       "year": 2024,
    383       "arxiv_id": "2410.05451",
    384       "relevance": "Proposes alignment-based defenses against prompt injection, relevant to AI safety and robustness research."
    385     },
    386     {
    387       "title": "ReAct: Synergizing reasoning and acting in language models",
    388       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik R Narasimhan", "Yuan Cao"],
    389       "year": 2023,
    390       "relevance": "Foundational work on reasoning and acting agents that are the subject of the attacks studied in this paper."
    391     },
    392     {
    393       "title": "The earth is flat because...: Investigating LLMs' belief towards misinformation via persuasive conversation",
    394       "authors": ["Rongwu Xu", "Brian Lin", "Shujian Yang"],
    395       "year": 2024,
    396       "relevance": "Studies LLM susceptibility to misinformation through persuasive conversation, directly relevant to understanding content-based attacks."
    397     },
    398     {
    399       "title": "Astute RAG: Overcoming imperfect retrieval augmentation and knowledge conflicts for large language models",
    400       "authors": ["Fei Wang", "Xingchen Wan", "Ruoxi Sun", "Jiefeng Chen", "Sercan Ö. Arık"],
    401       "year": 2024,
    402       "arxiv_id": "2410.07176",
    403       "relevance": "Addresses knowledge conflicts in retrieval-augmented generation, relevant to defending against content manipulation in RAG systems."
    404     },
    405     {
    406       "title": "Certifiably robust RAG against retrieval corruption",
    407       "authors": ["Chong Xiang", "Tong Wu", "Zexuan Zhong", "David Wagner", "Danqi Chen", "Prateek Mittal"],
    408       "year": 2024,
    409       "arxiv_id": "2405.15556",
    410       "relevance": "Proposes certifiable robustness for RAG systems against retrieval corruption, directly relevant to defending agents against content manipulation."
    411     },
    412     {
    413       "title": "Ignore previous prompt: Attack techniques for language models",
    414       "authors": ["Fábio Perez", "Ian Ribeiro"],
    415       "year": 2022,
    416       "relevance": "Early work on prompt injection attacks, provides foundational taxonomy that the current paper extends with the content-attack dimension."
    417     }
    418   ]
    419 }

Impressum · Datenschutz