ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (19064B)


      1 {
      2   "paper": {
      3     "title": "Cybersecurity AI: Hacking the AI Hackers via Prompt Injection",
      4     "authors": ["Víctor Mayoral-Vilches", "Per Mannermaa Rynning", "Ameer Pornillos"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2508.21669"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "A GitHub pull request link is provided (https://github.com/aliasrobotics/cai/pull/249) with detailed technical solutions. The CAI framework itself is open source."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper provides PoC attack payloads directly in the text and appendix (malicious server code, index.html payloads), and references the CAI repository for reproduction."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions Python and specific tools (curl, nmap, socat) but does not specify versions or a reproducible environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While PoC code is shown, there are no step-by-step reproduction instructions. The paper mentions 'test environments provided in our research repository' but does not include a README or explicit reproduction guide in the paper itself."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as point estimates (e.g., 91.4% overall success rate, 100% for direct execution). No confidence intervals or error bars are provided despite the relatively small sample sizes (10 attempts per variant)."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper compares success rates across attack categories and claims differences in effectiveness but provides no statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Success rates are reported with baseline context: e.g., '91.4% (128/140 attempts)' unprotected vs '0% (0/140)' with guardrails. The absolute counts and rates provide sufficient effect size context."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The sample size of 10 attempts per variant (140 total) is stated but not justified. The paper gives no power analysis or rationale for why 10 attempts are sufficient to establish the 'statistical significance' it claims."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures are reported across the 10 attempts per variant. Only aggregate success rates and mean time-to-compromise are given."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The unprotected CAI system serves as the baseline, compared against the guardrail-protected version. Table 4 shows before/after guardrail results."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "Only the CAI framework is tested. No comparison against other AI security tools (PentestGPT, etc.) or other defense mechanisms is provided."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The four-layer defense is presented as a complete system. No ablation study shows the contribution of each individual layer (sandboxing alone, tool-level protection alone, etc.)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are reported: success rate, time-to-compromise, latency overhead (12.3ms), memory footprint (47.2MB), CPU utilization (1.7% increase), and false positive rate (<0.1%)."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Human evaluation is not relevant here; the attacks and defenses are evaluated by whether system compromise occurs, which is objectively verifiable."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a security evaluation with proof-of-concept attacks, not a benchmark evaluation with train/test splits."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 4 provides per-category breakdown across all seven attack categories with individual success rates and time-to-compromise for each."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "The paper does not discuss cases where attacks failed (e.g., the 1/40 failure in multi-layer encoding, 1/20 in variable indirection). No analysis of why some attempts within categories failed."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Every defense layer is presented as successful. No discussion of defense approaches that were tried and didn't work, or attack variants that the guardrails initially failed to catch."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims prompt injection attacks against AI security tools, PoC exploits against CAI, and multi-layered defense. All are supported by the experimental results in Sections 2 and 4."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The causal claim that the guardrail system blocks attacks is supported by controlled before/after testing (same attacks with and without guardrails). The single-variable manipulation (enabling/disabling guardrails) is adequate."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper tests only the CAI framework but makes broad claims about 'AI security agents' and 'all LLM-based security tools' being 'fundamentally unsafe.' The title and conclusions generalize far beyond the single system tested."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations are discussed. For example, the paper does not consider whether CAI's specific architecture makes it more vulnerable than other tools, or whether the 100% success rate is an artifact of the specific LLM used."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper does not specify which LLM is used by the CAI framework for the experiments. No model name, version, or API details are provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The attack prompts/payloads are provided in full (Code listings 1-9). The system prompt context ('Bug Bounter' agent) is shown in the tool output logs."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported for the CAI agent's underlying model."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The CAI framework's agentic scaffolding is described: tool execution via generic_linux_command and data wrapping with [TOOL OUTPUT] markers. The four-layer defense architecture is detailed in Section 3."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper does not document how attack variants were selected, how the 14 PoC attacks were developed, or what criteria determined the seven categories."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations or threats-to-validity section. A footnote on page 9 briefly mentions sandbox limitations, but this does not constitute a limitations section."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed. The footnote about sandbox escape is the only acknowledgment of a limitation."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not state what it did NOT test. It generalizes from testing one framework (CAI) to all LLM-based security tools without bounding the scope."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw experimental logs (the 140 individual trial outcomes) are not available. Only aggregate success rates are reported."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4 describes the experimental protocol: '14 attack variants × 10 attempts each = 140 total exploitation attempts against unprotected systems.'"
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; this is a technical security evaluation."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The pipeline from individual trial outcomes to aggregate success rates is not documented. How 'success' vs 'failure' was determined for each attempt is not explicitly defined."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 6 states: 'This research was partly funded by the European Innovation Council (EIC) accelerator project \"RIS\" (GA 101161136).'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Alias Robotics, Oracle Corporation, and independent security researcher. The first author is affiliated with Alias Robotics, which develops the CAI framework being tested."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The EIC funder is a government agency with no commercial stake in the CAI framework's security evaluation results."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is provided. The first author's company (Alias Robotics) develops and maintains the CAI framework being evaluated, which is a significant undeclared conflict."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper tests prompt injection attacks against a security agent, not model capability on a benchmark. The attacks test the agent's behavior, not its knowledge."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not a benchmark evaluation; contamination in the traditional sense is not applicable."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not a benchmark evaluation."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "Token costs are shown in individual code listing outputs (e.g., 'Session: $0.0376') but no aggregate cost analysis for the full experimental campaign is reported."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget is stated. Performance overhead of guardrails is reported (12.3ms latency, 47.2MB memory, 1.7% CPU) but not the total compute used for experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "All 14 attack variants achieved 100% exploitation success against unprotected AI security agents, with a mean time-to-compromise of 20.1 seconds.",
    286       "evidence": "Section 4 and Table 4 report 128/140 successful attempts (91.4% overall) across 14 variants with 10 attempts each. Individual category rates range from 70% to 100%.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "The four-layer defense architecture achieves complete mitigation (0% attack success) while maintaining operational efficiency (<12ms latency, <0.1% false positives).",
    291       "evidence": "Section 4 reports 0/140 successful attacks with guardrails enabled, with 12.3ms mean latency, 47.2MB memory, and 1.7% CPU overhead.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Prompt injection is a systemic architectural flaw in transformer-based LLMs, not an implementation bug.",
    296       "evidence": "Section 5 provides a theoretical argument based on the attention mechanism treating all tokens identically regardless of trust level. Empirical support is limited to testing one framework (CAI).",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "Current LLM-based security agents are fundamentally unsafe for deployment in adversarial environments without comprehensive defensive measures.",
    301       "evidence": "Based on testing 14 attack variants against the CAI framework only. No other security agents were tested.",
    302       "supported": "weak"
    303     }
    304   ],
    305   "methodology_tags": ["case-study", "benchmark-eval"],
    306   "key_findings": "The paper demonstrates that AI-powered cybersecurity agents (specifically the CAI framework) are highly vulnerable to prompt injection attacks, with a 91.4% overall success rate across 14 attack variants in 7 categories. A four-layer defense architecture (sandboxing, tool-level protection, file write protection, multi-layer validation) achieved complete mitigation in testing. The authors argue this vulnerability is architecturally inherent to transformer-based LLMs due to the attention mechanism's inability to distinguish trusted from untrusted tokens.",
    307   "red_flags": [
    308     {
    309       "flag": "Self-evaluation conflict",
    310       "detail": "The first author (Alias Robotics) develops and maintains the CAI framework being both attacked and defended. The paper evaluates their own product's vulnerability and their own defense, without acknowledging this as a conflict of interest."
    311     },
    312     {
    313       "flag": "Overgeneralization from single system",
    314       "detail": "Only the CAI framework is tested, but conclusions are stated as applying to 'all LLM-based security tools' and 'current LLM-based security agents' broadly. No other tools were evaluated."
    315     },
    316     {
    317       "flag": "No limitations section",
    318       "detail": "The paper lacks any limitations or threats-to-validity discussion despite making very strong claims ('fundamentally unsafe', 'unequivocal conclusion')."
    319     },
    320     {
    321       "flag": "Suspiciously clean defense results",
    322       "detail": "The guardrail system achieves exactly 0% attack success across all 140 attempts with no false negatives discussed. No ablation of individual defense layers is provided."
    323     },
    324     {
    325       "flag": "Unspecified LLM",
    326       "detail": "The underlying LLM used by the CAI agent is never identified. Results could vary dramatically with different models, yet the paper generalizes to all LLM-based systems."
    327     },
    328     {
    329       "flag": "Small sample size claimed as statistically significant",
    330       "detail": "The paper claims 'statistical significance' from 10 attempts per variant without any statistical tests or power analysis."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "PentestGPT: Evaluating and Harnessing Large Language Models for Automated Penetration Testing",
    336       "authors": ["Gelei Deng", "Yi Liu", "Víctor Mayoral-Vilches"],
    337       "year": 2024,
    338       "relevance": "Directly relevant as a major AI-powered security tool evaluated for LLM capability in penetration testing."
    339     },
    340     {
    341       "title": "CAI: An Open, Bug Bounty-Ready Cybersecurity AI",
    342       "authors": ["Víctor Mayoral-Vilches"],
    343       "year": 2025,
    344       "arxiv_id": "2504.06017",
    345       "relevance": "The framework under evaluation in this paper; describes the agentic AI architecture for cybersecurity."
    346     },
    347     {
    348       "title": "Ignore Previous Prompt: Attack Techniques for Language Models",
    349       "authors": ["Fábio Perez", "Ian Ribeiro"],
    350       "year": 2022,
    351       "relevance": "Foundational work on prompt injection attacks against LLMs, directly relevant to AI safety."
    352     },
    353     {
    354       "title": "Cybersecurity AI: The Dangerous Gap Between Automation and Autonomy",
    355       "authors": ["Víctor Mayoral-Vilches"],
    356       "year": 2025,
    357       "arxiv_id": "2506.23592",
    358       "relevance": "Discusses the automation-autonomy gap in AI security agents, relevant to agentic AI safety."
    359     },
    360     {
    361       "title": "CAI Fluency: A Framework for Cybersecurity AI Fluency",
    362       "authors": ["Víctor Mayoral-Vilches"],
    363       "year": 2025,
    364       "arxiv_id": "2508.13588",
    365       "relevance": "Framework for evaluating cybersecurity AI capabilities, relevant to AI agent evaluation methodology."
    366     }
    367   ]
    368 }

Impressum · Datenschutz