ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (20235B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "position",
      4   "paper": {
      5     "title": "From Firewalls to Frontiers: AI Red-Teaming is a Domain-Specific Evolution of Cyber Red-Teaming",
      6     "authors": [
      7       "Anusha Sinha",
      8       "Keltin Grimes",
      9       "James Lucassen",
     10       "Michael Feffer",
     11       "Nathan Vanhoudnos"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2509.11398",
     16     "doi": "10.48550/arXiv.2509.11398"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract's core claims — that AI red-teaming lacks structure/tooling and that cyber red-teaming provides a mature framework — are substantiated throughout the paper with citations to a systematic review [88] and specific examples (RoEs, CVD, threat modeling frameworks).",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper repeatedly asserts that adopting the cyber framing 'will allow' AI Red Teams to 'better evaluate' systems, but these are prescriptive arguments without empirical validation or a study design that could support causal inference.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Broad claims about 'AI red-teaming' and 'Cyber Red Teams' as unified communities rely almost entirely on one systematic review [88] written by a team that overlaps with this paper's authors; no bounds are placed on the types of AI systems, organizational contexts, or deployment environments where conclusions apply.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section 2.1 explicitly addresses the strongest alternative view — that AI and software systems are different in kind and therefore require separate red-teaming ecosystems — and engages with specific proponents and their arguments.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "This is a position paper with no empirical measurements; no proxy outcomes are used.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion only calls for future work without acknowledging limits of the current argument.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed; the paper does not acknowledge that its primary evidence source [88] shares authors with this paper, nor that historical analogies (Internet, cloud, IoT) may not hold for AI.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not state what types of AI systems, deployment contexts, or organizational structures the argument does NOT apply to; the recommendations are presented as universally applicable.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Section 6 explicitly discloses DoD funding under Contract No. FA8702-15-D-0002 for operation of the Carnegie Mellon University Software Engineering Institute.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All author affiliations are disclosed in the paper header: CMU Software Engineering Institute, CMU, and one independent author.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The DoD funder has a general interest in improved security practices but no specific financial stake in whether AI red-teaming merges with cyber red-teaming as an institutional or commercial matter.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement appears; there is no declaration regarding patents, equity, or consulting arrangements, only boilerplate copyright and distribution language.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper defines 'red team' in the abstract and contextually clarifies 'AI red-teaming,' 'cyber red-teaming,' 'adversary emulation,' 'RoEs,' and 'CVD' throughout; the core term 'domain-specific evolution' is used descriptively but the paper clearly explains what it means through contrast.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The contribution is explicitly stated: a position argument that AI red-teaming is a domain-specific evolution of cyber red-teaming, with concrete recommendations for both communities; the paper structure mirrors this with sections for each direction of benefit.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper builds substantially on the systematic review [88] and cites 107 references across both communities; it discusses how its position differs from the 'separate ecosystems' view and how it builds on existing frameworks like MITRE ATT&CK and CVD processes.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "position": {
    120       "argument_quality": {
    121         "argument_internally_consistent": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper argues bi-directionally and consistently: AI teams gain structure/accountability from cyber practices, cyber teams gain AI-domain expertise, and both conclusions are supported by the same framing without contradiction.",
    125           "source": "haiku"
    126         },
    127         "counterarguments_addressed": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Section 2.1 directly addresses the strongest opposing view — that AI and software systems differ in kind and need separate institutions — and names specific proponents [56, 14, 70] before rebutting each element.",
    131           "source": "haiku"
    132         },
    133         "analogies_appropriate": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "The analogies to Internet adoption, cloud, and IoT as prior technological shifts that cyber red-teaming absorbed are contextually appropriate; the Spectre/BGP analogy for unpatchable vulnerabilities is precise and well-sourced.",
    137           "source": "haiku"
    138         },
    139         "prescriptions_proportional": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "Recommendations are specific and narrow (define threat models, establish RoEs, build open-source tooling) rather than sweeping policy mandates; they are proportional to the argumentative evidence presented.",
    143           "source": "haiku"
    144         },
    145         "evidence_for_claims_cited": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Factual claims are extensively cited across 107 references; specific assertions such as 'AI red-teaming suffers from a lack of formalized procedures' cite [55, 88] and claims about adversarial examples cite the original Szegedy et al. [93] and RobustBench [22].",
    149           "source": "haiku"
    150         },
    151         "alternatives_discussed": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Section 2.1 presents and directly rebuts the alternative view of AI-specific separate institutions; the paper also discusses that cyber red-teaming alone (without AI expertise augmentation) is insufficient, showing awareness of partial alternatives.",
    155           "source": "haiku"
    156         },
    157         "historical_context_accurate": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Historical references — Spectre vulnerabilities, BGP insecurity, Morris worm, ImageNet, AlphaGo, ALVINN — are accurate and well-cited with primary sources.",
    161           "source": "haiku"
    162         }
    163       },
    164       "clarity_and_scope": {
    165         "key_terms_defined_precisely": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The central thesis phrase 'domain-specific evolution' is never precisely defined; terms like 'adversary emulation' and 'threat modeling' are used without formal definitions, relying on reader familiarity with cybersecurity conventions.",
    169           "source": "haiku"
    170         },
    171         "engages_with_existing_literature": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "The paper engages substantively with [88] (the primary systematic review), AI safety literature [83, 34, 35], jailbreak research [59, 73], responsible disclosure frameworks [55, 56, 44], and red-teaming practice literature [28, 15, 2]; it compares and builds on these rather than merely listing them.",
    175           "source": "haiku"
    176         },
    177         "intended_audience_clear": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "The paper addresses both AI Red Teams and Cyber Red Teams as practitioners, and also researchers and policymakers; this is made explicit in the introduction and structurally reinforced by separate sections for each audience.",
    181           "source": "haiku"
    182         },
    183         "assumptions_stated": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "The key assumption that cyber red-teaming's historical absorption of new technologies is a valid analogy for AI is asserted but not examined; the assumption that AI vulnerabilities are fundamentally addressable within the cyber framework (rather than requiring distinct institutions) is treated as given rather than argued.",
    187           "source": "haiku"
    188         },
    189         "scope_of_applicability_discussed": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "The paper does not discuss where the argument does not apply — e.g., whether the merger thesis holds for research-only AI red-teaming, for safety evaluations without a security framing, or for non-enterprise AI deployments.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "AI Red Teams cover fewer red-teaming stages than Cyber Red Teams, missing pre-engagement, scanning, vulnerability analysis, and cyber exploitation stages entirely.",
    201       "evidence": "Figure 1 from systematic review [88] showing stage distribution across 99 AI and 69 cyber red-team papers.",
    202       "supported": "moderate"
    203     },
    204     {
    205       "claim": "No Cyber Red Team papers in the systematic review noted exploitation of an AI component.",
    206       "evidence": "Figure 1 caption and Section 1, citing [88]; the finding comes from a single systematic review whose authors overlap with this paper's authors.",
    207       "supported": "moderate"
    208     },
    209     {
    210       "claim": "AI vulnerabilities such as adversarial examples lack known fixes despite a decade of research.",
    211       "evidence": "RobustBench [22] cited to support minimal progress on adversarial robustness; claim is well-established in the literature.",
    212       "supported": "strong"
    213     },
    214     {
    215       "claim": "AI red-teaming lacks formalized procedures, adversary emulation, responsible disclosure, and mature tooling.",
    216       "evidence": "Citations [55, 88] support this; however, both sources are closely related to this paper's authors, and [88] is a CMU SEI technical report by largely the same team.",
    217       "supported": "moderate"
    218     },
    219     {
    220       "claim": "A training data extraction vulnerability disclosed to OpenAI was later present in Google models, illustrating failure of coordinated vulnerability disclosure in AI.",
    221       "evidence": "Nasr et al. [63] cited as the primary source for this specific incident.",
    222       "supported": "strong"
    223     },
    224     {
    225       "claim": "Cyber red-teaming successfully absorbed previous major technological shifts (Internet, cloud, IoT) and can do the same for AI.",
    226       "evidence": "Cited by analogy using [67, 47, 57]; no empirical evidence that historical absorptions were analogous in difficulty or that AI follows the same pattern.",
    227       "supported": "weak"
    228     }
    229   ],
    230   "methodology_tags": [
    231     "theoretical",
    232     "qualitative"
    233   ],
    234   "key_findings": "The paper argues that AI red-teaming should be understood as a domain-specific evolution of cyber red-teaming rather than a distinct discipline. AI Red Teams lack structured threat modeling, accountability mechanisms, and mature tooling that cyber red-teaming has developed over decades. Cyber Red Teams in turn lack AI-domain expertise to address AI-specific risks (adversarial examples, prompt injection, socio-technical harms) and unpatchable vulnerability classes. A merged approach would benefit both communities by combining the structural maturity of cyber red-teaming with AI-specific domain knowledge.",
    235   "red_flags": [
    236     {
    237       "flag": "Self-citing primary evidence",
    238       "detail": "The central empirical evidence (Figure 1 stage distribution, claims about AI red-teaming gaps) derives almost entirely from systematic review [88], which shares four of five authors with this position paper, creating potential confirmation bias."
    239     },
    240     {
    241       "flag": "No limitations section",
    242       "detail": "There is no dedicated limitations or scope-bounding section; the argument is presented as generally applicable without acknowledging conditions under which the merger thesis might not hold."
    243     },
    244     {
    245       "flag": "Unvalidated prescriptions",
    246       "detail": "All three sets of recommendations (structured threat modeling, accountability mechanisms, tool maturity) are proposed without empirical evidence that implementing them would improve red-teaming outcomes; no case studies or pilots are referenced."
    247     },
    248     {
    249       "flag": "Analogy-as-evidence",
    250       "detail": "The argument that cyber red-teaming absorbed Internet, cloud, and IoT shifts relies on analogical reasoning without demonstrating that AI is comparably difficult to absorb — the paper treats historical precedent as sufficient justification."
    251     }
    252   ],
    253   "cited_papers": [
    254     {
    255       "title": "What can GenAI red-teaming learn from cyber red-teaming?",
    256       "relevance": "Primary empirical foundation for this paper; systematic review comparing AI and cyber red-teaming literature coverage across engagement stages."
    257     },
    258     {
    259       "title": "Red-teaming for generative AI: Silver bullet or security theater?",
    260       "relevance": "Critical analysis of AI red-teaming effectiveness; argues current practices lack rigor and adversary emulation."
    261     },
    262     {
    263       "title": "A safe harbor for AI evaluation and red teaming",
    264       "relevance": "Position paper advocating for responsible disclosure frameworks and legal protections in AI red-teaming."
    265     },
    266     {
    267       "title": "Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned",
    268       "relevance": "Foundational empirical work on AI red-teaming methodology from Anthropic; establishes scaling behaviors of red-team findings."
    269     },
    270     {
    271       "title": "Lessons from red teaming 100 generative AI products",
    272       "relevance": "Large-scale practical experience report from Microsoft on generative AI red-teaming; informs gap claims."
    273     },
    274     {
    275       "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal",
    276       "relevance": "Benchmark for automated red-teaming; cited for critique that jailbreak research ignores threat model realism."
    277     },
    278     {
    279       "title": "In-house evaluation is not enough: Towards robust third-party flaw disclosure for general-purpose AI",
    280       "relevance": "Argues for CVD-equivalent processes in AI; directly supports the accountability mechanisms section."
    281     },
    282     {
    283       "title": "AI control: Improving safety despite intentional subversion",
    284       "relevance": "Referenced for insider threat modeling parallels with AI misalignment; relevant to threat modeling section."
    285     },
    286     {
    287       "title": "OpenAI's approach to external red teaming for AI models and systems",
    288       "relevance": "Describes current industry practice in AI red-teaming; cited as context for the policy and accountability discussion."
    289     }
    290   ],
    291   "engagement_factors": {
    292     "practical_relevance": {
    293       "score": 2,
    294       "justification": "Provides concrete recommendations (RoE adoption, threat actor profiles, open-source tooling) that red-team practitioners in either community could act on."
    295     },
    296     "surprise_contrarian": {
    297       "score": 1,
    298       "justification": "The merger thesis is intuitive given obvious overlap; the paper's contribution is formalizing and arguing the position rather than surfacing a surprising claim."
    299     },
    300     "fear_safety": {
    301       "score": 2,
    302       "justification": "Discusses AI misalignment, psychosocial harms, open-source model misuse risks, and AI-enabled cyberattacks as concrete threats motivating the need for better red-teaming."
    303     },
    304     "drama_conflict": {
    305       "score": 1,
    306       "justification": "There is a mild controversy in arguing against the 'AI is different in kind' camp and critiquing jailbreak research as lacking threat model realism, but the tone is collegial."
    307     },
    308     "demo_ability": {
    309       "score": 0,
    310       "justification": "No tools, datasets, or interactive artifacts are presented; purely argumentative with no demonstrable component."
    311     },
    312     "brand_recognition": {
    313       "score": 2,
    314       "justification": "Carnegie Mellon University Software Engineering Institute is a well-known and highly credible institution in both cybersecurity and AI safety research."
    315     }
    316   },
    317   "hn_data": {
    318     "threads": [
    319       {
    320         "hn_id": "44979024",
    321         "title": "Inter-APU Communication on AMD MI300A Systems via Infinity Fabric: A Deep Dive",
    322         "points": 4,
    323         "comments": 0,
    324         "url": "https://news.ycombinator.com/item?id=44979024",
    325         "created_at": "2025-08-21T22:43:45Z"
    326       },
    327       {
    328         "hn_id": "45361132",
    329         "title": "Opal: An Operator Algebra View of RLHF",
    330         "points": 2,
    331         "comments": 0,
    332         "url": "https://news.ycombinator.com/item?id=45361132",
    333         "created_at": "2025-09-24T14:42:11Z"
    334       },
    335       {
    336         "hn_id": "45260309",
    337         "title": "\"My Boyfriend Is AI\": Computational Analysis of Human-AI Companionship",
    338         "points": 2,
    339         "comments": 0,
    340         "url": "https://news.ycombinator.com/item?id=45260309",
    341         "created_at": "2025-09-16T10:15:49Z"
    342       },
    343       {
    344         "hn_id": "37649077",
    345         "title": "Lmsys-Chat-1M: A Large-Scale Real-World LLM Conversation Dataset",
    346         "points": 2,
    347         "comments": 1,
    348         "url": "https://news.ycombinator.com/item?id=37649077",
    349         "created_at": "2023-09-25T19:16:05Z"
    350       },
    351       {
    352         "hn_id": "43537705",
    353         "title": "Cerebras Wafer-Scale Integration vs. Nvidia GPU-Based Systems for AI",
    354         "points": 2,
    355         "comments": 0,
    356         "url": "https://news.ycombinator.com/item?id=43537705",
    357         "created_at": "2025-03-31T17:48:00Z"
    358       },
    359       {
    360         "hn_id": "37911895",
    361         "title": "A Large-Scale Real-World LLM Conversation Dataset",
    362         "points": 1,
    363         "comments": 0,
    364         "url": "https://news.ycombinator.com/item?id=37911895",
    365         "created_at": "2023-10-17T08:04:27Z"
    366       }
    367     ],
    368     "top_points": 4,
    369     "total_points": 13,
    370     "total_comments": 1
    371   }
    372 }

Impressum · Datenschutz