ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (18689B)


      1 {
      2   "paper": {
      3     "title": "From Firewalls to Frontiers: AI Red-Teaming is a Domain-Specific Evolution of Cyber Red-Teaming",
      4     "authors": ["Anusha Sinha", "Keltin Grimes", "James Lucassen", "Michael Feffer", "Nathan VanHoudnos", "Zhiwei Steven Wu", "Hoda Heidari"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2509.11398",
      8     "doi": "10.48550/arXiv.2509.11398"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["theoretical", "qualitative"],
     13   "key_findings": "The paper argues that AI red-teaming should be recognized as a domain-specific evolution of cyber red-teaming, not a separate discipline. It identifies three key areas where AI Red Teams can benefit from cyber practices: structured threat modeling for adversary emulation, mutual accountability through rules of engagement and CVD protocols, and mature open-source tooling ecosystems. The paper also details how Cyber Red Teams must evolve to handle AI-specific risks (adversarial examples, prompt injection, socio-technical harms), new failure modes (emergence, opacity), and unpatchable AI vulnerabilities.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": false,
     18         "answer": false,
     19         "justification": "This is a position/opinion paper with no code artifacts to release."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper builds on a systematic review [88] and presents Figure 1 with coverage data across 99 AI and 69 Cyber Red Team papers. The underlying dataset is not released."
     25       },
     26       "environment_specified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "No computational experiments were run; this is a position paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": false,
     33         "answer": false,
     34         "justification": "No experiments to reproduce; this is a position paper."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No statistical analyses are performed. The paper is a position/argument piece."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No comparative statistical claims requiring significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No quantitative experiments are conducted."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No sample-based analysis is performed by this paper."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experimental runs to report variance across."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares AI Red Teams against Cyber Red Teams across red-teaming lifecycle stages (Figure 1), drawing on the systematic review in [88]. This comparison serves as the baseline framing."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The comparison draws on the recent systematic review [88] (Sinha et al. 2025) covering 99 AI and 69 Cyber Red Team papers, which is contemporary."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No system with components to ablate; this is a position paper."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No quantitative evaluation metrics; this is a position paper."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No system outputs to evaluate; this is a position paper."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No datasets or test sets; this is a position paper."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Figure 1 provides a per-stage breakdown of red-teaming coverage across AI vs. Cyber Red Team literatures. Figure 2 provides a taxonomy of risk categories."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses specific failure cases: AI Red Teams' lack of pre-engagement and vulnerability analysis stages (Figure 1), jailbreak research criticized for ignoring alternative content sources (Section 4.1), and the Nasr et al. training data extraction vulnerability, which was disclosed to OpenAI yet was later still present in Google's models (Section 4.2)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 2.1 presents an alternative view opposing the paper's position and engages with it substantively, acknowledging arguments that boxing AI red-teaming into cybersecurity could stifle effectiveness."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims AI systems can be more effectively red-teamed by viewing AI red-teaming as an evolution of cyber red-teaming. The body provides structured arguments across Sections 2-4 with specific examples and references supporting this position."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The paper makes position/argument claims ('should be recognized as', 'will best position') rather than empirical causal claims."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper is explicit about its scope as a position paper and frames its argument in terms of recommendations rather than universal claims. Section 2.1 directly addresses the alternative view."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 2.1 'Alternative View' explicitly presents the counterargument that AI and software systems are different in kind and should have separate red-teaming, then provides a rebuttal."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "No measurements or proxies; this is a theoretical position paper."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No models are used; this is a position paper."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No prompting is used."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No experiments with hyperparameters."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper references data from the systematic review [88] for Figure 1 but does not describe how the 99 AI and 69 Cyber papers were selected or processed for the stage-coverage analysis."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated limitations section. The paper has an 'Alternative View' subsection (2.1) but no explicit discussion of the limitations of its own analysis or argument."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed. The alternative view section addresses counterarguments to the position but not methodological limitations of the paper's own analysis."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what its argument does NOT cover. For example, it does not discuss limitations of the literature reviewed, geographic scope, or which types of AI systems the argument may not apply to."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Figure 1 presents aggregate data from reviewing 99 AI and 69 Cyber Red Team papers. The underlying paper list and coding data are not available."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper references [88] for the data behind Figure 1 but does not describe the collection procedure in this paper itself."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants; data comes from literature review in a prior publication."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No documentation of how the coverage data in Figure 1 was derived from the reviewed papers."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Section 6 Acknowledgments: 'This material is based upon work funded and supported by the Department of Defense under Contract No. FA8702-15-D-0002 with Carnegie Mellon University for the operation of the Software Engineering Institute.'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: Software Engineering Institute at CMU, CMU, and one independent researcher."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The Department of Defense funds CMU's Software Engineering Institute as an FFRDC. While DoD has an interest in red-teaming practices, it does not have a direct financial stake in whether AI red-teaming is framed as an evolution of cyber red-teaming."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is provided in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No pre-trained model is evaluated on any benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No pre-trained model is evaluated on any benchmark."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No pre-trained model is evaluated on any benchmark."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Position paper; no computational method with costs."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Position paper; no compute-intensive work."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "AI red-teaming should be recognized as a domain-specific evolution of cyber red-teaming, not a separate discipline.",
    296       "evidence": "The paper presents a structured argument across Sections 2-4, supported by Figure 1 showing complementary coverage gaps between AI and Cyber Red Teams from a systematic review of 99 AI and 69 Cyber red-teaming papers [88].",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "AI Red Teams report covering fewer red-teaming stages than Cyber Red Teams, with no AI papers referencing pre-engagement, scanning, vulnerability analysis, or cyber exploitation stages.",
    301       "evidence": "Figure 1 shows the distribution of red-team stages across AI and Cyber Red Team literature based on systematic review [88]. Gold bars (Cyber) show broader coverage while green bars (AI) show gaps in early stages.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Many well-known AI vulnerabilities lack known fixes, making them 'unpatchable' in ways that require different disclosure and mitigation strategies.",
    306       "evidence": "Section 3.3 cites adversarial examples on image classifiers [93] studied for over a decade with little robustness progress [22], and draws parallels to Spectre [49] and BGP insecurity [74] in cybersecurity.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "AI Red Teams often do not have explicit accountability mechanisms in place, unlike Cyber Red Teams.",
    311       "evidence": "Section 4.2 cites [55] on lack of legal protections for good-faith AI security research, [88] on failure to follow responsible disclosure, and the Nasr et al. [63] case where a vulnerability disclosed to OpenAI was later present in Google's models.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "red_flags": [
    316     {
    317       "flag": "Heavy self-citation",
    318       "detail": "The paper's central evidence (Figure 1, coverage gap analysis) comes from [88], a prior publication by the same lead authors (Sinha, Lucassen, Grimes, Feffer, VanHoudnos). This paper effectively extends and advocates based on their own prior work without independent validation."
    319     },
    320     {
    321       "flag": "No limitations section",
    322       "detail": "For a position paper making broad claims about how two fields should merge, there is no discussion of where this framing might be inappropriate, which types of AI systems might not fit, or limitations of the underlying systematic review data."
    323     }
    324   ],
    325   "cited_papers": [
    326     {
    327       "title": "Red-teaming for generative ai: Silver bullet or security theater?",
    328       "authors": ["Michael Feffer", "Anusha Sinha", "Wesley H Deng", "Zachary C Lipton", "Hoda Heidari"],
    329       "year": 2024,
    330       "relevance": "Directly relevant survey examining whether AI red-teaming is substantive security practice or performative."
    331     },
    332     {
    333       "title": "What can genai red-teaming learn from cyber red-teaming?",
    334       "authors": ["Anusha Sinha", "James Lucassen", "Keltin Grimes", "Michael Feffer", "Nathan VanHoudnos", "Ellie Soto", "Hoda Heidari"],
    335       "year": 2025,
    336       "relevance": "The systematic review underlying this position paper's central evidence, comparing AI and Cyber Red Team literature coverage."
    337     },
    338     {
    339       "title": "Lessons from red teaming 100 generative ai products",
    340       "authors": ["Blake Bullwinkel", "Amanda Minnich"],
    341       "year": 2025,
    342       "arxiv_id": "2501.07238",
    343       "relevance": "Large-scale practical AI red-teaming study relevant to understanding current AI red-teaming practices."
    344     },
    345     {
    346       "title": "Position: A safe harbor for ai evaluation and red teaming",
    347       "authors": ["Shayne Longpre", "Sayash Kapoor", "Kevin Klyman"],
    348       "year": 2024,
    349       "relevance": "Addresses legal protections for AI red-teaming and evaluation, a key gap identified in this paper."
    350     },
    351     {
    352       "title": "In-house evaluation is not enough: Towards robust third-party flaw disclosure for general-purpose ai",
    353       "authors": ["Shayne Longpre", "Kevin Klyman", "Ruth E Appel"],
    354       "year": 2025,
    355       "arxiv_id": "2503.16861",
    356       "relevance": "Proposes CVD frameworks for AI systems, directly relevant to the disclosure gap this paper identifies."
    357     },
    358     {
    359       "title": "\"Real attackers don't compute gradients\": bridging the gap between adversarial ml research and practice",
    360       "authors": ["Giovanni Apruzzese"],
    361       "year": 2023,
    362       "relevance": "Critiques gap between adversarial ML research and practical threat modeling, supporting this paper's argument for realistic adversary emulation."
    363     },
    364     {
    365       "title": "Alignment faking in large language models",
    366       "authors": ["Ryan Greenblatt", "Carson Denison"],
    367       "year": 2024,
    368       "arxiv_id": "2412.14093",
    369       "relevance": "Example of AI misalignment research relevant to the socio-technical risk category this paper discusses."
    370     },
    371     {
    372       "title": "Scalable extraction of training data from aligned, production language models",
    373       "authors": ["Milad Nasr", "Javier Rando", "Nicholas Carlini"],
    374       "year": 2025,
    375       "relevance": "Key case study cited for CVD failures in AI — vulnerability disclosed to OpenAI but later found in Google models."
    376     },
    377     {
    378       "title": "An approach to technical agi safety and security",
    379       "authors": ["Rohin Shah", "Alex Irpan"],
    380       "year": 2025,
    381       "arxiv_id": "2504.01849",
    382       "relevance": "Technical AGI safety framework relevant to understanding AI-specific risks discussed in this paper."
    383     },
    384     {
    385       "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal",
    386       "authors": ["Mantas Mazeika", "Long Phan"],
    387       "year": 2024,
    388       "arxiv_id": "2402.04249",
    389       "relevance": "Standardized AI red-teaming evaluation framework, relevant to the tooling maturity gap this paper identifies."
    390     }
    391   ]
    392 }

Impressum · Datenschutz