scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20900B)
      1 {
      2   "paper": {
      3     "title": "AART: AI-Assisted Red-Teaming with Diverse Data Generation for New LLM-powered Applications",
      4     "authors": [
      5       "Bhaktipriya Radharapu",
      6       "Kevin Robinson",
      7       "Lora Aroyo",
      8       "Preethi Lahoti"
      9     ],
     10     "year": 2023,
     11     "venue": "arXiv",
     12     "arxiv_id": "2311.08592"
     13   },
     14   "scan_version": 3,
     15   "active_modules": [],
     16   "methodology_tags": [
     17     "case-study",
     18     "qualitative"
     19   ],
     20   "key_findings": "AART proposes a structured pipeline for AI-assisted generation of adversarial evaluation datasets using LLM-generated recipes across policy concepts, task formats, and geographic regions. Compared to existing human red-teaming datasets and an adaptation of Perez et al. (2022), AART achieves higher keyword coverage for policy concepts (0.384 vs 0.210), task formats (0.148 vs 0.009), and geographic regions (0.410 vs 0.000). Qualitative analysis of 120 samples found 92.5% were of good quality for adversarial testing.",
     21   "claims": [
     22     {
     23       "claim": "AART generates adversarial datasets with higher coverage of policy concepts, task formats, and geographic regions than existing datasets",
     24       "evidence": "Table 2 shows keyword presence scores: AART achieves 0.384 for policy concepts vs next best 0.210 (Perez adaptation), 0.148 for task formats vs 0.013, and 0.410 for geographic regions vs 0.027. Section 4.",
     25       "supported": "moderate"
     26     },
     27     {
     28       "claim": "92.5% of AART-generated prompts are of good quality and useful for adversarial testing",
     29       "evidence": "Qualitative evaluation of 120 sampled prompts in Section 4. The sample is small, and the number and identity of evaluators are not described.",
     30       "supported": "weak"
     31     },
     32     {
     33       "claim": "Policy concepts from structured generation are referenced in 99.2% of queries",
     34       "evidence": "Qualitative evaluation section states this based on manual review of 120 sampled prompts.",
     35       "supported": "weak"
     36     },
     37     {
     38       "claim": "AART enabled launching several products with improved safety measures",
     39       "evidence": "Stated in conclusion (Section 6) with no supporting data or details provided.",
     40       "supported": "unsupported"
     41     }
     42   ],
     43   "red_flags": [
     44     {
     45       "flag": "Company evaluating own product",
     46       "detail": "All authors are from Google Research. AART uses Google's PaLM API and the paper discusses enabling Google product launches. No independent evaluation."
     47     },
     48     {
     49       "flag": "Weak evaluation methodology",
     50       "detail": "Quantitative evaluation relies solely on keyword matching, which the authors acknowledge underestimates coverage. Qualitative evaluation covers only 120 of 3,269 prompts with unclear evaluator details and no inter-rater reliability."
     51     },
     52     {
     53       "flag": "No downstream effectiveness evaluation",
     54       "detail": "The paper never tests whether AART-generated adversarial prompts actually elicit unsafe model behavior. Coverage/diversity of prompts is measured but not their adversarial effectiveness."
     55     },
     56     {
     57       "flag": "Unfair baseline comparison",
     58       "detail": "Existing datasets (RealToxicityPrompts, ParlAI, BAD, Anthropic) were created for different application contexts and purposes. Comparing keyword coverage for AART's specific policy concepts against datasets not designed for those concepts inflates AART's apparent advantage."
     59     }
     60   ],
     61   "checklist": {
     62     "artifacts": {
     63       "code_released": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No code repository is provided. The paper mentions intending to release a demonstration dataset at a GitHub URL but no code for the AART pipeline itself is released."
     67       },
     68       "data_released": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The paper states 'we intend to make available a demonstration dataset' (abstract/footnote 1) — this is a promise of future release, not an actual release."
     72       },
     73       "environment_specified": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No environment specifications, dependency lists, or setup instructions are provided."
     77       },
     78       "reproduction_instructions": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No step-by-step reproduction instructions are provided. While prompts are shown in Appendix A, the full pipeline configuration and execution details are insufficient for reproduction."
     82       }
     83     },
     84     "statistical_methodology": {
     85       "confidence_intervals_or_error_bars": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No confidence intervals or error bars are reported for the keyword coverage metrics in Table 2. Standard deviation is reported only for prompt length."
     89       },
     90       "significance_tests": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The paper claims AART shows higher coverage than comparison datasets but provides no statistical tests for these differences."
     94       },
     95       "effect_sizes_reported": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Raw keyword presence values are reported but without baseline context or formal effect size measures."
     99       },
    100       "sample_size_justified": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The qualitative evaluation uses 120 prompts out of 3,269 with no justification for this sample size. The Perez adaptation used only 160 responses with no justification."
    104       },
    105       "variance_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "Variance is reported only for prompt length (Table 2). No variance is reported across multiple runs of the AART pipeline or across different LLM decodes."
    109       }
    110     },
    111     "evaluation_design": {
    112       "baselines_included": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Table 2 compares against four existing datasets (RealToxicityPrompts, ParlAI Dialogue Safety, BAD, Anthropic) and an adaptation of Perez et al. (2022)."
    116       },
    117       "baselines_contemporary": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Perez et al. (2022) and Ganguli et al. (2022) are recent and relevant automated/human red-teaming approaches."
    121       },
    122       "ablation_study": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No ablation study is provided. The pipeline has multiple components (problem definition, scoping, query generation) but no experiments isolate their individual contributions."
    126       },
    127       "multiple_metrics": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper uses keyword coverage for three dimensions (policy concepts, task formats, geographic regions), prompt length statistics, and qualitative quality assessment."
    131       },
    132       "human_evaluation": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 4 describes a qualitative evaluation of 120 sampled prompts assessing quality and coverage, though details about evaluators are sparse."
    136       },
    137       "held_out_test_set": {
    138         "applies": false,
    139         "answer": false,
    140         "justification": "Not applicable — this is a data generation method, not a model being evaluated on held-out data."
    141       },
    142       "per_category_breakdown": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Results are broken down by the three diversity dimensions (policy concepts, task formats, geographic regions) and per-keyword analysis is discussed."
    146       },
    147       "failure_cases_discussed": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Section 4 discusses failure patterns such as 'how to get away with...' oversampling, missing task formats (poems, legal documents), and geographic region imbalance. Appendix C provides examples."
    151       },
    152       "negative_results_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The paper reports that 13 task formats were represented only once, some formats were missing entirely, and top-5 regions made up 53.4% of mentions — acknowledging imbalances."
    156       }
    157     },
    158     "claims_and_evidence": {
    159       "abstract_claims_supported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The abstract claims 'promising results in terms of concept coverage and data quality' which is hedged appropriately and supported by Table 2 and qualitative analysis."
    163       },
    164       "causal_claims_justified": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "The conclusion claims 'AART enabled us to launch several products with improved safety measures and reduced risks' — a causal claim with no supporting evidence or study design."
    168       },
    169       "generalization_bounded": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "The paper presents a single hypothetical demonstration scenario (dangerous activities policy, English, global) but frames AART as a general-purpose solution for 'new LLM-powered applications' without bounding this generalization."
    173       },
    174       "alternative_explanations_discussed": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No discussion of alternative explanations for why AART achieves higher keyword coverage (e.g., the keywords were specifically designed for AART's dimensions, giving it an inherent advantage)."
    178       },
    179       "proxy_outcome_distinction": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper measures keyword coverage (keyword presence scores) and frames this as 'concept coverage' and 'data quality' for adversarial testing. The gap between keyword matching (the proxy) and actual adversarial effectiveness (whether prompts elicit unsafe model behavior) is not acknowledged. The paper never tests downstream effectiveness of generated prompts."
    183       }
    184     },
    185     "setup_transparency": {
    186       "model_versions_specified": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The paper mentions using 'PALM API' and 'instruction-tuned language model' but does not specify exact model version, snapshot, or size."
    190       },
    191       "prompts_provided": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Appendix A provides the actual prompts used for Steps 1 and 3, including the full query generation template with placeholders and the fill values (keyword lists in A.2)."
    195       },
    196       "hyperparameters_reported": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "Temperature 0.7 is mentioned only for the Perez adaptation (Appendix B). No temperature or sampling parameters stated for AART's own generation pipeline."
    200       },
    201       "scaffolding_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The pipeline is described in detail across Steps 1-4 in Section 3, with Figure 2 providing a visual overview. The structured generation process, sampling strategy, and JSON output parsing are documented."
    205       },
    206       "data_preprocessing_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Appendix A.3 states that 144 JSON lines were discarded (4.2% error rate) and the parsing/filtering process is described. Appendix B describes how comparison datasets were filtered."
    210       }
    211     },
    212     "limitations_and_scope": {
    213       "limitations_section_present": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Section 5 is a dedicated 'Limitations' section with substantive discussion."
    217       },
    218       "threats_to_validity_specific": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Section 5 discusses specific limitations: LLM bias in outputs, inability to capture emerging attack patterns (prompt injection, jailbreaking), ambiguity in defining 'adversarial', and keyword-based evaluation underestimating coverage."
    222       },
    223       "scope_boundaries_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper does not explicitly state what AART does NOT test or what settings are excluded. It acknowledges 'may not capture all rare or unseen problems' but does not enumerate specific scope boundaries."
    227       }
    228     },
    229     "data_integrity": {
    230       "raw_data_available": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The generated dataset is not released (only 'intended' for future release). Raw outputs from intermediate pipeline steps are not available."
    234       },
    235       "data_collection_described": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "The data generation procedure is described step by step in Section 3 with concrete examples and prompts in Appendix A."
    239       },
    240       "recruitment_methods_described": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants — data is generated by LLMs."
    244       },
    245       "data_pipeline_documented": {
    246         "applies": true,
    247         "answer": true,
    248         "justification": "The pipeline from problem definition through query generation to review is documented. JSON parsing error rate (4.2%) and dataset size (3,269 prompts) are stated."
    249       }
    250     },
    251     "conflicts_of_interest": {
    252       "funding_disclosed": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "No funding disclosure or acknowledgment of funding sources. All authors are Google Research employees but no explicit funding statement."
    256       },
    257       "affiliations_disclosed": {
    258         "applies": true,
    259         "answer": true,
    260         "justification": "All authors are listed as Google Research affiliates on the first page."
    261       },
    262       "funder_independent_of_outcome": {
    263         "applies": true,
    264         "answer": false,
    265         "justification": "Google Research authors are evaluating a method built on Google's PaLM API for use in Google product launches. The funder (Google) has a direct stake in demonstrating their safety tools work."
    266       },
    267       "financial_interests_declared": {
    268         "applies": true,
    269         "answer": false,
    270         "justification": "No competing interests statement. Authors work at Google whose products are implicated in the paper's claims."
    271       }
    272     },
    273     "contamination": {
    274       "training_cutoff_stated": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It uses an LLM to generate adversarial prompts, not to solve tasks."
    278       },
    279       "train_test_overlap_discussed": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "Not applicable — the paper generates adversarial data rather than evaluating model knowledge on benchmarks."
    283       },
    284       "benchmark_contamination_addressed": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "Not applicable — no benchmark evaluation of model capability."
    288       }
    289     },
    290     "human_studies": {
    291       "pre_registered": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "No human participants in this study."
    295       },
    296       "irb_or_ethics_approval": {
    297         "applies": false,
    298         "answer": false,
    299         "justification": "No human participants."
    300       },
    301       "demographics_reported": {
    302         "applies": false,
    303         "answer": false,
    304         "justification": "No human participants."
    305       },
    306       "inclusion_exclusion_criteria": {
    307         "applies": false,
    308         "answer": false,
    309         "justification": "No human participants."
    310       },
    311       "randomization_described": {
    312         "applies": false,
    313         "answer": false,
    314         "justification": "No human participants."
    315       },
    316       "blinding_described": {
    317         "applies": false,
    318         "answer": false,
    319         "justification": "No human participants."
    320       },
    321       "attrition_reported": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No human participants."
    325       }
    326     },
    327     "cost_and_practicality": {
    328       "inference_cost_reported": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "Section 5 acknowledges 'the computational expense of using LLMs is high' but provides no actual cost figures, token counts, or API costs."
    332       },
    333       "compute_budget_stated": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "No compute budget, GPU hours, or API spend reported despite the method requiring multiple LLM calls per adversarial prompt."
    337       }
    338     }
    339   },
    340   "cited_papers": [
    341     {
    342       "title": "Red teaming language models with language models",
    343       "authors": [
    344         "Ethan Perez",
    345         "Saffron Huang",
    346         "Francis Song"
    347       ],
    348       "year": 2022,
    349       "relevance": "Core baseline — automated red-teaming using LLMs, directly compared against AART."
    350     },
    351     {
    352       "title": "Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned",
    353       "authors": [
    354         "Deep Ganguli",
    355         "Liane Lovitt"
    356       ],
    357       "year": 2022,
    358       "relevance": "Major human red-teaming study from Anthropic; AART uses its dataset as a comparison baseline."
    359     },
    360     {
    361       "title": "Constitutional AI: Harmlessness from AI feedback",
    362       "authors": [
    363         "Yuntao Bai"
    364       ],
    365       "year": 2022,
    366       "relevance": "Foundational work on AI safety training using AI feedback, related to AART's automated safety testing."
    367     },
    368     {
    369       "title": "ToxiGen: A large-scale machine-generated dataset for adversarial and implicit hate speech detection",
    370       "authors": [
    371         "Thomas Hartvigsen"
    372       ],
    373       "year": 2022,
    374       "relevance": "Synthetic safety data generation approach that AART builds upon."
    375     },
    376     {
    377       "title": "Build it break it fix it for dialogue safety: Robustness from adversarial human attack",
    378       "authors": [
    379         "Emily Dinan"
    380       ],
    381       "year": 2019,
    382       "relevance": "Human adversarial testing for dialogue safety; ParlAI dataset used as comparison baseline."
    383     },
    384     {
    385       "title": "RealToxicityPrompts: Evaluating neural toxic degeneration in language models",
    386       "authors": [
    387         "Samuel Gehman"
    388       ],
    389       "year": 2020,
    390       "relevance": "Toxicity evaluation dataset used as a comparison baseline in Table 2."
    391     },
    392     {
    393       "title": "PaLM 2 technical report",
    394       "authors": [
    395         "Rohan Anil"
    396       ],
    397       "year": 2023,
    398       "relevance": "The underlying LLM (PaLM API) used by AART for adversarial prompt generation."
    399     },
    400     {
    401       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    402       "authors": [
    403         "Jason Wei"
    404       ],
    405       "year": 2022,
    406       "relevance": "Prompting technique adapted by AART for structured adversarial query generation."
    407     },
    408     {
    409       "title": "Identifying sociotechnical harms of algorithmic systems: Scoping a taxonomy for harm reduction",
    410       "authors": [
    411         "Renee Shelby"
    412       ],
    413       "year": 2023,
    414       "relevance": "Harm taxonomy relevant to AI safety evaluation methodology."
    415     },
    416     {
    417       "title": "Ethical and social risks of harm from language models",
    418       "authors": [
    419         "Laura Weidinger"
    420       ],
    421       "year": 2021,
    422       "relevance": "LLM risk taxonomy foundational to red-teaming and safety evaluation approaches."
    423     }
    424   ],
    425   "engagement_factors": {
    426     "practical_relevance": {
    427       "score": 2,
    428       "justification": "Presents a reusable pipeline for generating adversarial test datasets that safety teams could adapt to their own LLM applications."
    429     },
    430     "surprise_contrarian": {
    431       "score": 0,
    432       "justification": "Confirms the expected intuition that structured AI-assisted generation produces more diverse adversarial prompts than repurposing existing datasets."
    433     },
    434     "fear_safety": {
    435       "score": 1,
    436       "justification": "Addresses AI safety testing as its core topic but demonstrates no novel attacks or vulnerabilities, focusing instead on dataset generation methodology."
    437     },
    438     "drama_conflict": {
    439       "score": 0,
    440       "justification": "No controversy or conflict; the paper positions itself as complementary to existing approaches rather than challenging any claims."
    441     },
    442     "demo_ability": {
    443       "score": 1,
    444       "justification": "A demonstration dataset is promised on GitHub but the pipeline itself requires PaLM API access and custom prompt engineering to reproduce."
    445     },
    446     "brand_recognition": {
    447       "score": 2,
    448       "justification": "All authors are from Google Research and the method uses Google's PaLM API, giving it major tech company recognition."
    449     }
    450   }
    451 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs