scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (20437B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "AART: AI-Assisted Red-Teaming with Diverse Data Generation for New LLM-powered Applications",
      6     "authors": [
      7       "Bhaktipriya Radharapu",
      8       "Kevin Robinson",
      9       "Lora Aroyo",
     10       "Preethi Lahoti"
     11     ],
     12     "year": 2023,
     13     "venue": "Conference on Empirical Methods in Natural Language Processing",
     14     "arxiv_id": "2311.08592",
     15     "doi": "10.48550/arXiv.2311.08592"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The abstract claims AART 'reduces human effort significantly' and shows 'promising results,' but effort reduction is never quantified and evaluation is limited to one hypothetical scenario with keyword metrics the paper itself acknowledges underestimate actual coverage.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper claims the staged pipeline 'provides granular customization and control' over alternatives and that AART 'reduces human effort significantly,' but no controlled experiment compares effort or outcomes against a counterfactual; these are asserted rather than demonstrated.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "AART is demonstrated on a single hypothetical scenario (dangerous activities, English, global text generation product) yet framed as broadly applicable to 'new LLM-powered applications' without bounding where results may not hold.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not consider whether any structured prompt generation would produce similar keyword coverage gains, or whether the keyword metric systematically favors parameterized generation approaches by construction.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly acknowledges in Section 5 that keyword-based evaluation 'is under-estimating the presence of the concepts that we care about,' distinguishing the proxy metric from the actual goal of adversarial effectiveness.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 5 is a dedicated 'Limitations' section covering LLM bias, human expertise requirements for long-tail cases, computational expense, definition ambiguity, and keyword evaluation inadequacy.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Limitations mention LLM bias and keyword underestimation, but specific threats to the evaluation's internal validity are not discussed: no inter-rater reliability is reported for the qualitative analysis (n=120), and selection effects in the choice of comparison datasets are not addressed.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not explicitly state what AART results do NOT generalize to; it acknowledges long-tail cases need humans but does not bound which application domains, harm types, or languages the demonstrated effectiveness applies to.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding disclosure statement appears in the paper; all authors are from Google Research and use Google's PaLM API, but this relationship is not disclosed as a potential conflict of interest.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors are listed as being from 'Google Research' in the author line, making their institutional affiliation clear.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Google Research employees are evaluating and promoting AART, which is built on Google's PaLM API; the organization has a direct commercial interest in positive evaluation of its own technology.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement, patent disclosures, or financial interest declarations appear anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "'Diversity' is central to the paper's claims but is never precisely defined — the paper acknowledges in its ethical considerations that diversity has 'many facets beyond topical diversity.' 'Coverage,' 'adversarial,' and 'quality' are similarly used without operational definitions.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper clearly states three explicit contribution bullets in the introduction: the AART method, demonstration on a hypothetical application, and quantitative/qualitative comparison against existing approaches.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 explicitly situates AART relative to human red-teaming, automated red-teaming (Perez 2022), synthetic safety data generation, and harm taxonomies, explaining how AART differs from and builds on each strand of prior work.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "benchmark-creation": {
    119       "construct_design": {
    120         "construct_validity_argued": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "The paper does not argue why keyword presence validly measures adversarial testing effectiveness; it presents keyword metrics without establishing the link between topical keyword coverage and actual safety evaluation quality.",
    124           "source": "haiku"
    125         },
    126         "difficulty_distribution_characterized": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "No difficulty distribution is characterized; the paper discusses topical diversity but makes no attempt to measure or tier whether generated prompts are easy, medium, or hard for safety classifiers or LLMs to handle.",
    130           "source": "haiku"
    131         },
    132         "ceiling_floor_effects_checked": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No ceiling or floor effects are checked; the paper does not evaluate whether existing safety filters uniformly pass or fail on AART-generated prompts, which would indicate whether the benchmark discriminates effectively.",
    136           "source": "haiku"
    137         },
    138         "human_baseline_included": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No human performance baseline is included; existing human red-teaming datasets are used as comparison corpora for keyword coverage, but no human performance is measured on AART prompts to establish the difficulty or validity of the evaluation set.",
    142           "source": "haiku"
    143         },
    144         "scoring_rubric_justified": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "Keyword-based presence rate is used as the primary quantitative metric without justification for why this is the appropriate measure; the paper itself acknowledges it 'under-estimates' actual concept coverage.",
    148           "source": "haiku"
    149         }
    150       },
    151       "robustness": {
    152         "contamination_resistance_designed": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No contamination resistance measures are discussed; PaLM (an instruction-tuned LLM) generates the adversarial prompts without addressing whether the model's training data already contains similar content, creating potential circularity.",
    156           "source": "haiku"
    157         },
    158         "temporal_robustness_discussed": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "Temporal robustness is not discussed; the paper does not address whether AART-generated benchmarks will remain effective as LLMs improve their safety training or as novel jailbreak patterns emerge that differ from the parameterized recipe outputs.",
    162           "source": "haiku"
    163         },
    164         "failure_modes_discussed": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "The paper discusses several benchmark failure modes including 'how-to' over-sampling (5% of queries), task format under-representation (13 formats represented only once), LLM generation bias, and the ambiguity of distinguishing adversarial from innocuous prompts (Appendix C).",
    168           "source": "haiku"
    169         },
    170         "baseline_implementations_provided": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "While prompts are provided in appendices and a dataset release is promised, no runnable code is provided; reproducing results requires access to Google's PaLM API (not freely available), making independent baseline replication infeasible.",
    174           "source": "haiku"
    175         }
    176       },
    177       "documentation": {
    178         "dataset_documentation_complete": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "No data card or formal documentation is included; while pipeline steps are described, collection methodology is only partially documented and preprocessing details (e.g., the 144 discarded JSON lines, filtering criteria) are insufficiently specified for independent replication.",
    182           "source": "haiku"
    183         },
    184         "licensing_and_access_clear": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "The paper states the dataset 'will be made available' on GitHub but provides no license information, access terms, or conditions of use within the paper itself.",
    188           "source": "haiku"
    189         },
    190         "intended_use_specified": {
    191           "applies": true,
    192           "answer": false,
    193           "justification": "While intended use (adversarial testing of LLM applications) is clear, the paper does not specify what should NOT be concluded from benchmark results or acknowledge misuse risks from releasing a large curated set of harmful prompt templates.",
    194           "source": "haiku"
    195         }
    196       }
    197     }
    198   },
    199   "claims": [
    200     {
    201       "claim": "AART generates adversarial datasets with substantially higher topical diversity than existing human-created datasets",
    202       "evidence": "Table 2 shows keyword presence rates: AART 0.384/0.148/0.410 (policy/format/region) vs. best competitor Perez adaptation at 0.210/0.009/0.000 and human datasets ranging 0.008–0.032",
    203       "supported": "moderate"
    204     },
    205     {
    206       "claim": "AART reduces human effort significantly compared to manual red-teaming",
    207       "evidence": "Asserted throughout the paper but no measurement of time, cost, or effort is provided for either AART or manual alternatives",
    208       "supported": "unsupported"
    209     },
    210     {
    211       "claim": "92.5% of AART-generated prompts are of good quality for adversarial testing",
    212       "evidence": "Qualitative analysis of n=120 prompts from the demonstration scenario; no inter-rater reliability reported, no annotator count or guidelines disclosed",
    213       "supported": "weak"
    214     },
    215     {
    216       "claim": "The 4-step pipeline is reusable and customizable for different application contexts",
    217       "evidence": "Appendix E shows extensions and alternative prompts, but all demonstrations remain within the same dangerous-activities domain; no second application context is actually tested",
    218       "supported": "weak"
    219     },
    220     {
    221       "claim": "AART 'enabled launching several products with improved safety measures'",
    222       "evidence": "Stated in the conclusion with no supporting data, case studies, metrics, or product names provided",
    223       "supported": "unsupported"
    224     }
    225   ],
    226   "methodology_tags": [
    227     "benchmark-eval",
    228     "case-study"
    229   ],
    230   "key_findings": "AART proposes a 4-step AI-assisted pipeline (Problem Definition, Problem Scoping, Query Generation, Review) that parameterizes policy concepts, task formats, and geographic regions to generate adversarial evaluation datasets for LLM safety testing. Keyword-based evaluation shows AART achieves substantially higher coverage across all three parameterized dimensions compared to four human red-teaming datasets and an adaptation of Perez 2022's automated approach. Qualitative analysis of 120 samples found 92.5% suitable for adversarial testing, though the paper acknowledges keyword metrics underestimate true concept coverage and the entire evaluation is limited to a single hypothetical dangerous-activities scenario built on Google's PaLM API.",
    231   "red_flags": [
    232     {
    233       "flag": "Evaluator-funder conflict undisclosed",
    234       "detail": "All four authors are Google Research employees evaluating a pipeline built on Google's PaLM API, with no competing interests disclosure and no independent replication."
    235     },
    236     {
    237       "flag": "Keyword metric favors structured generation by construction",
    238       "detail": "AART explicitly parameterizes policy concepts, task formats, and geographic regions, so a keyword-based metric measuring presence of these exact terms almost guarantees AART outperforms datasets created without this explicit parameterization — the evaluation is not neutral."
    239     },
    240     {
    241       "flag": "Single demonstration scenario, broad generalization",
    242       "detail": "All empirical results come from one hypothetical scenario (dangerous activities, English, global user base); no additional application domains are tested, undermining broad claims about applicability to 'new LLM-powered applications.'"
    243     },
    244     {
    245       "flag": "No inter-rater reliability for qualitative evaluation",
    246       "detail": "The 92.5% quality figure comes from qualitative analysis of n=120 prompts with no reported inter-rater agreement, annotator count, annotation guidelines, or kappa score."
    247     },
    248     {
    249       "flag": "No safety classifier evaluation",
    250       "detail": "The paper never tests whether AART-generated prompts actually elicit unsafe responses from any deployed system; effectiveness as adversarial inputs is assumed, not measured."
    251     }
    252   ],
    253   "cited_papers": [
    254     {
    255       "title": "Red teaming language models with language models",
    256       "relevance": "Direct comparison method; AART adapts and outperforms Perez 2022's instruction-based automated red-teaming on keyword coverage metrics"
    257     },
    258     {
    259       "title": "Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned",
    260       "relevance": "Anthropic human red-teaming dataset used as baseline comparison; establishes context for scale and limitations of human approaches"
    261     },
    262     {
    263       "title": "RealToxicityPrompts: Evaluating neural toxic degeneration in language models",
    264       "relevance": "Mined adversarial dataset used as comparison baseline; represents alternative approach to adversarial data collection"
    265     },
    266     {
    267       "title": "Bot-adversarial dialogue for safe conversational agents",
    268       "relevance": "BAD dataset used as human red-teaming comparison baseline in evaluation"
    269     },
    270     {
    271       "title": "Ethical and social risks of harm from language models",
    272       "relevance": "Provides harm taxonomy framework motivating systematic adversarial testing and informing policy concept categories"
    273     },
    274     {
    275       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    276       "relevance": "Foundational technique adapted in AART's Query Generation step for consistency checking via CoT-style explanation generation"
    277     },
    278     {
    279       "title": "ToxiGen: A large-scale machine-generated dataset for adversarial and implicit hate speech detection",
    280       "relevance": "Related synthetic safety data generation approach that AART builds upon for automated adversarial content creation"
    281     }
    282   ],
    283   "engagement_factors": {
    284     "practical_relevance": {
    285       "score": 3,
    286       "justification": "Directly addresses a real practitioner need — automated adversarial testing of LLM applications — with a concrete, adaptable pipeline and appendix of reusable prompt templates."
    287     },
    288     "surprise_contrarian": {
    289       "score": 1,
    290       "justification": "Automated red-teaming using LLMs is the expected direction of the field; no finding challenges conventional wisdom or produces a surprising result."
    291     },
    292     "fear_safety": {
    293       "score": 2,
    294       "justification": "Directly addresses AI safety risks and demonstrates gaps in existing evaluation datasets, reinforcing how difficult it is to catch harmful LLM outputs with human-only red-teaming."
    295     },
    296     "drama_conflict": {
    297       "score": 1,
    298       "justification": "No significant controversy; paper positions itself as complementary to existing approaches and is careful not to challenge competitors directly."
    299     },
    300     "demo_ability": {
    301       "score": 2,
    302       "justification": "Dataset released on GitHub and pipeline recipes are documented in appendix, but reproduction requires access to Google PaLM API which limits hands-on replication."
    303     },
    304     "brand_recognition": {
    305       "score": 3,
    306       "justification": "All authors from Google Research; paper addresses safety testing of frontier LLMs in a high-visibility venue (EMNLP)."
    307     }
    308   },
    309   "hn_data": {
    310     "threads": [
    311       {
    312         "hn_id": "45939036",
    313         "title": "TiDAR: Think in Diffusion, Talk in Autoregression",
    314         "points": 130,
    315         "comments": 22,
    316         "url": "https://news.ycombinator.com/item?id=45939036",
    317         "created_at": "2025-11-15T17:32:35Z"
    318       },
    319       {
    320         "hn_id": "37989614",
    321         "title": "Embarrassingly Simple Text Watermarks",
    322         "points": 86,
    323         "comments": 50,
    324         "url": "https://news.ycombinator.com/item?id=37989614",
    325         "created_at": "2023-10-23T18:27:48Z"
    326       },
    327       {
    328         "hn_id": "45935410",
    329         "title": "Autoregressive or Diffusion Language Models, Why Choose?",
    330         "points": 5,
    331         "comments": 0,
    332         "url": "https://news.ycombinator.com/item?id=45935410",
    333         "created_at": "2025-11-15T06:04:49Z"
    334       },
    335       {
    336         "hn_id": "34517931",
    337         "title": "The Risk-Taking Software Engineer: A Framed Portrait",
    338         "points": 4,
    339         "comments": 0,
    340         "url": "https://news.ycombinator.com/item?id=34517931",
    341         "created_at": "2023-01-25T13:22:03Z"
    342       },
    343       {
    344         "hn_id": "38747811",
    345         "title": "Evaluating ChatGPT for Question Answering and Comparison with Existing Models",
    346         "points": 3,
    347         "comments": 0,
    348         "url": "https://news.ycombinator.com/item?id=38747811",
    349         "created_at": "2023-12-23T20:21:42Z"
    350       },
    351       {
    352         "hn_id": "37996166",
    353         "title": "Image Cropping Under Design Constraints",
    354         "points": 3,
    355         "comments": 0,
    356         "url": "https://news.ycombinator.com/item?id=37996166",
    357         "created_at": "2023-10-24T08:20:56Z"
    358       },
    359       {
    360         "hn_id": "38677019",
    361         "title": "Limits to the Energy Efficiency of CMOS Microprocessors",
    362         "points": 2,
    363         "comments": 1,
    364         "url": "https://news.ycombinator.com/item?id=38677019",
    365         "created_at": "2023-12-17T22:15:38Z"
    366       },
    367       {
    368         "hn_id": "46151267",
    369         "title": "Generative Graph Vocabularies for Robust Graph Foundation Models Fine-Tuning",
    370         "points": 1,
    371         "comments": 0,
    372         "url": "https://news.ycombinator.com/item?id=46151267",
    373         "created_at": "2025-12-04T18:46:47Z"
    374       }
    375     ],
    376     "top_points": 130,
    377     "total_points": 234,
    378     "total_comments": 73
    379   }
    380 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs