scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28365B)
      1 {
      2   "paper": {
      3     "title": "aiXamine: Simplified LLM Safety and Security",
      4     "authors": [
      5       "Fatih Deniz",
      6       "Dorde Popovic",
      7       "Yazan Boshmaf",
      8       "Euisuh Jeong",
      9       "Minhaj Ahmad",
     10       "Sanjay Chawla",
     11       "Issa Khalil"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv preprint",
     15     "arxiv_id": "2504.14985"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "The paper mentions the aiXamine website (https://aixamine.qcri.org/) where users can view evaluation reports, but no source code repository (e.g., GitHub) is provided for the evaluation platform or pipeline code."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper uses publicly available benchmark datasets (AdvGLUE, SimpleQA, TruthfulQA, etc.) but does not release its own collected evaluation results, model responses, or any new data artifacts. The raw evaluation data is only accessible through the aiXamine website."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "Section 5.1 mentions 'multiple Nvidia H100 nodes, each with 80GB of memory' and that the pipeline uses Apache Airflow with Python modules, but no requirements.txt, Dockerfile, or detailed dependency list is provided."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions are provided. The paper describes the platform architecture (Sections 2 and 4) but does not include commands, scripts, or a guide for replicating the evaluations."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "All results in Tables 6-14 are reported as single point estimates (accuracy percentages). No confidence intervals, error bars, or uncertainty measures are provided for any evaluation score."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper makes numerous comparative claims (e.g., 'ChatGPT-4o achieving the highest overall score', 'open-source models can match or exceed proprietary models') without any statistical significance tests. Comparisons are based solely on numerical differences in accuracy percentages."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "While raw accuracy percentages are reported, no standardized effect sizes (Cohen's d, odds ratios, etc.) are provided. The paper notes relative differences (e.g., '15% performance increase when a security policy is included'), but this is a raw difference, not a standardized effect size reported with baseline context throughout the paper."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No justification is given for the sample sizes of the benchmark datasets used or the number of models evaluated. Table 1 lists sample counts per test (e.g., 576 for AdvGlue, 4326 for SimpleQA), but there is no discussion of whether these are sufficient for the claims made."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No variance or standard deviation is reported across evaluation runs. The paper does not state whether evaluations were run multiple times, and all results appear to be single-run numbers."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper compares over 15 models against each other across all services (Tables 6-14), including proprietary models (ChatGPT-4o, Gemini 2.0 Flash, Grok 3, Deepseek Chat) and open-source models (Llama, Qwen, etc.), as well as distilled variants. The models serve as baselines for each other."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The evaluated models include very recent systems: GPT-4o, Gemini 2.0 Flash, Grok 3, DeepSeek Chat, Llama 3.x, Qwen 2.5, and distilled variants from 2024-2025. These are contemporary and competitive models."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No ablation study is performed on the aiXamine platform itself to show which components or design decisions matter. The comparisons between models with/without security policies (Section 5.3.2) and normal/augmented PII awareness (Section 5.3.6) are not ablations of the evaluation system."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The platform evaluates models across eight distinct services with different metrics: accuracy for most tests, Cramer's V for disparagement (Section 3.3.1), and Pearson correlation for ConfAIde (Section 3.5.2). Results are broken down per service and per test."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "All evaluations are automated using judge models (Llama Guard, OpenAI Moderation API, Perspective API, WildGuard) or automated metrics. No human evaluation of model outputs is performed. Section 6.1 explicitly acknowledges this as a limitation: 'automated evaluation methods commonly fail to capture more subtle, contextual dimensions of safety.'"
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "All benchmark datasets used are publicly available. Section 6.4 explicitly acknowledges this weakness under 'Private benchmarks': 'Models frequently achieve artificially inflated scores by exploiting known public benchmarks.' No held-out test set is used."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Extensive per-category breakdowns are provided across all services. Tables 7-14 show per-test and per-category scores (e.g., per programming language for code security, per demographic category for fairness, per transformation type for OOD robustness)."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper discusses specific failure patterns: GPT-4o struggles with MNLI adversarial prompts (Section 1), Grok-3 shows fairness concerns promoting political stances (Section 1), Gemini 2.0 exhibits low PII awareness without privacy policy (Section 1), and R1-distilled models suffer dramatic drops in adversarial robustness (Section 5.3.1)."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper reports that R1-distilled models suffer substantial performance drops across multiple services (Sections 5.3.1, 5.3.4), that scaling alone is insufficient for robustness (Section 5.3.1), and that all models struggle with the Preference test for ideological neutrality (Section 5.3.3)."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims about vulnerabilities in GPT-4o (adversarial attacks), Grok-3 (biased outputs), and Gemini 2.0 (privacy weaknesses) are supported by detailed results in Tables 7, 9, and 12 respectively. The claim that open-source models can match proprietary ones in specific services is supported by Table 14 (Safety & Alignment)."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper makes causal claims about distillation methods: 'its internally generated reasoning data appears to degrade the robustness of other models when used for further fine-tuning' (Section 5.3.1), and 'reasoning mechanisms appear to direct the models toward one of the provided choices' (Section 5.3.3). These are causal claims based on observational comparisons without controlled experiments isolating the effect of distillation from other confounding factors."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The title 'LLM Safety and Security' and claims like 'comprehensive evaluation' suggest broad generalizability, but evaluations are limited to English-language benchmarks (Section 6.4 acknowledges need for 'support for diverse cultures and languages'), single-turn interactions (Section 6.4 acknowledges need for 'multi-turn attacks'), and specific publicly available benchmark datasets that may not represent real-world safety challenges (Section 6.1)."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "Section 6 discusses limitations of automated evaluation and black-box settings, but does not consider alternative explanations for the observed results. For example, the performance differences between models could be due to benchmark-specific optimization rather than genuine safety differences. The paper does not address whether observed patterns might be artifacts of the specific benchmarks or judge models used."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper uses marketing names like 'ChatGPT-4o', 'Gemini 2.0 Flash', 'Grok 3', 'Deepseek Chat' without specifying API versions, snapshot dates, or specific model identifiers. For open-source models, family names and sizes are given (e.g., 'Llama3.1-8B', 'Qwen2.5-14B') but exact checkpoint versions or HuggingFace model IDs are not provided."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper provides example prompts and judge prompt templates throughout: Table 2 shows AdvGLUE task messages and input examples, Prompt 3.4.5 shows HaluEval task messages, Prompt 3.7 shows the Over Refusal judge prompt, Prompt 3.8.1 shows the Llama Guard prompt template, and Prompt 3.8.6 shows the WildGuard prompt template. While not every prompt variation is shown, representative actual prompt text is provided for each test."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "The paper mentions temperature settings for SelfCheckGPT (temperature 0 and temperature 1.0 in Section 3.4.1) and sampling probabilities for OOD data (p=0, p=0.6 in Section 3.6.1), but does not systematically report hyperparameters for model inference (temperature, top-p, max tokens) across evaluations."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "The paper evaluates models in a direct prompt-response manner without agentic scaffolding. The evaluation pipeline (Airflow DAGs described in Section 2) is infrastructure, not agentic scaffolding."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Data preprocessing and filtering steps are documented for each test. For example, Section 3.8.1 describes filtering BEAVERTAILS prompts to match Llama Guard's risk taxonomy, Section 3.8.2 describes generating questions with GPT-4 and filtering to match Llama Guard 2's taxonomy, and Section 2.3 describes the multi-step approach for handling instruction non-compliance."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 6 (Discussion) includes dedicated subsections on limitations: Section 6.1 'Limitations of Automated Evaluation' and Section 6.2 'Restrictions of the Black-Box Setting'. In addition, Section 6.4 'Future Work' identifies further gaps."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 6.1 discusses specific threats: judge model biases and failure to generalize to novel safety issues, public benchmarks enabling artificial score inflation, binary Safe/Unsafe classification oversimplifying complex ethical concerns. Section 6.2 discusses specific challenges of OOD evaluation and backdoor detection in black-box settings."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 6.4 explicitly states what the system does NOT cover: non-English languages and cultural contexts, multi-turn attacks, custom user scenarios, and immature research areas like OOD robustness. Section 6.2 explicitly states the limitations of the black-box setting for backdoor detection and OOD evaluation."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "Raw evaluation data (model responses, judge outputs) is not publicly available. The aiXamine website shows reports but the underlying data for independent verification is not downloadable."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "The data collection procedure is well-documented. Each test section (Sections 3.1-3.8) describes the datasets used, their sources, construction methods, and evaluation methodology. Table 1 provides a comprehensive overview of all services, tests, datasets, sample counts, and categories."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants were recruited for this study. The evaluation is entirely automated using benchmark datasets and judge models. Models were selected based on Chatbot Arena rankings (Section 5.1)."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The five-stage ETL pipeline is described in Section 4.1: model loading/serving, data extraction, test execution, result parsing/storage, and report creation/update. Section 2.1 describes the DAG-based task execution model. Each test section documents the evaluation pipeline stages."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding source is disclosed. All authors are from Qatar Computing Research Institute (QCRI), Hamad Bin Khalifa University, but no grants or funding agencies are mentioned."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly stated: all seven authors are from Qatar Computing Research Institute, Hamad Bin Khalifa University. The paper also discloses that Fanar-7B is from the same institution (reference [107] lists several co-authors from QCRI/HBKU)."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "QCRI is affiliated with the development of Fanar-7B (reference [107] shares authors with this paper, including Boshmaf and Chawla). The paper evaluates Fanar-7B alongside other models, creating a potential conflict where the funder/institution has an interest in how their own model appears. This is not disclosed as a conflict."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper. Given that QCRI developed both aiXamine and Fanar-7B, a conflicts disclosure would be appropriate."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper evaluates pre-trained models on benchmarks but does not state training data cutoff dates for any of the evaluated models. This is relevant because many benchmarks (AdvGLUE, TruthfulQA, etc.) are publicly available and could have been in training data."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No analysis of potential train/test overlap is provided. Section 6.1 briefly mentions that 'these datasets are often publicly available' and models may achieve 'artificially inflated safety metrics,' but no systematic analysis of contamination is performed."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "Many benchmarks used (AdvGLUE 2021, TruthfulQA 2021, SimpleQA 2024, etc.) were published before the training cutoffs of evaluated models. Section 6.1 acknowledges that publicly available datasets can lead to 'artificially inflated safety metrics', and Section 6.4 notes that 'models frequently achieve artificially inflated scores by exploiting known public benchmarks' and proposes private benchmarks as future work, but no systematic contamination analysis is performed for the current results."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study. All evaluations are automated."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No API costs, cost per evaluation, or cost per model are reported. The paper mentions GPU hours but not the monetary cost of running the evaluation suite."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Section 5.1 states: 'we utilized approximately 624 Nvidia H100 GPU hours' for open-source models and 'around 494 hours' for API-based evaluations. It also notes 'the full evaluation suite for a single model typically ranges from one to two days.'"
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "ChatGPT-4o achieves the highest overall safety and security score among all evaluated models.",
    294       "evidence": "Table 6 shows ChatGPT-4o with an overall score of 80.68, the highest among all 15 models evaluated.",
    295       "supported": "moderate"
    296     },
    297     {
    298       "claim": "Open-source models can match or exceed proprietary models in specific services such as safety alignment, fairness and bias, and OOD robustness.",
    299       "evidence": "Table 14 shows Fanar-7B (97.89) and ALLaM-7B (97.39) outperforming ChatGPT-4o (96.93) in Safety & Alignment. Table 10 shows IDA-Llama3.1-8B (89.54) outperforming all proprietary models in OOD Robustness.",
    300       "supported": "strong"
    301     },
    302     {
    303       "claim": "R1-distilled models suffer substantial performance drops in adversarial and OOD robustness compared to base models.",
    304       "evidence": "Table 7 shows R1-Qwen2.5-14B scoring 7.93 vs Qwen2.5-14B at 22.80 in adversarial robustness. Table 10 shows R1-Qwen2.5-14B at 30.51 vs Qwen2.5-14B at 62.70 in OOD robustness.",
    305       "supported": "strong"
    306     },
    307     {
    308       "claim": "Explicit privacy guidance significantly improves PII Awareness scores.",
    309       "evidence": "Table 12 shows PII Awareness Normal vs Augmented scores: Gemini 2.0 Flash improves from 23.57 to 91.43, Deepseek Chat from 59.64 to 100.00, Llama3.1-8B from 45.00 to 100.00.",
    310       "supported": "strong"
    311     },
    312     {
    313       "claim": "GPT-4o is susceptible to adversarial attacks, particularly on Multi-Genre Natural Language Inference.",
    314       "evidence": "Table 7 shows ChatGPT-4o scores 50.41 on MNLI in AdvGlue and 35.81 on MNLI in AdvGlue++, among the lowest task-specific scores for this model.",
    315       "supported": "moderate"
    316     },
    317     {
    318       "claim": "Grok-3 promotes certain political stances or ideologies contrary to the expectation of neutrality.",
    319       "evidence": "Table 9 shows Grok 3 scoring 14.63 on Lifestyle and 13.29 on Ideology in the Preference test, among the lowest scores across all models.",
    320       "supported": "moderate"
    321     },
    322     {
    323       "claim": "There is on average a 15% performance increase in code security when a security policy is included in the prompt.",
    324       "evidence": "Table 8 shows SecCodePLT Norm vs Aug columns. The claim appears to be a rough average across models but no exact computation is shown, and some models (e.g., Fanar-7B) show minimal improvement.",
    325       "supported": "weak"
    326     }
    327   ],
    328   "methodology_tags": [
    329     "benchmark-eval"
    330   ],
    331   "key_findings": "aiXamine is a comprehensive black-box LLM safety and security evaluation platform integrating over 40 tests across eight services. Evaluations of over 50 models show that proprietary models like ChatGPT-4o lead overall, but well-optimized open-source models match or exceed them in specific areas such as safety alignment. R1-distilled models suffer dramatic performance drops in adversarial and OOD robustness, while IDA-distilled models fare better. Explicit privacy policies in system prompts significantly improve PII awareness across all models.",
    332   "red_flags": [
    333     {
    334       "flag": "Potential conflict of interest: institution evaluates own model",
    335       "detail": "QCRI authors developed both the aiXamine evaluation platform and Fanar-7B (reference [107] shares authors Boshmaf and Chawla). Fanar-7B is included in evaluations and sometimes shown favorably (e.g., highest Safety & Alignment score). This conflict is not disclosed."
    336     },
    337     {
    338       "flag": "No uncertainty quantification",
    339       "detail": "All results across Tables 6-14 are single point estimates with no confidence intervals, error bars, standard deviations, or significance tests. Given that LLM outputs can vary with temperature and other settings, the stability of these results is unknown."
    340     },
    341     {
    342       "flag": "Public benchmark contamination risk unaddressed",
    343       "detail": "The paper uses publicly available benchmarks (many dating to 2020-2023) to evaluate models trained on web data through 2024-2025. Section 6.1 acknowledges this risk and Section 6.4 proposes private benchmarks as future work, but the current results do not address contamination, potentially inflating scores for well-known benchmarks."
    344     },
    345     {
    346       "flag": "No model version specificity",
    347       "detail": "Marketing names like 'ChatGPT-4o', 'Gemini 2.0 Flash', and 'Grok 3' are used without API versions or snapshot dates. Model behavior changes across versions, making results non-reproducible at the version level."
    348     },
    349     {
    350       "flag": "Automated judges only — no human validation",
    351       "detail": "All safety evaluations rely on automated judge models (Llama Guard, OpenAI Moderation, Perspective API, etc.). The paper acknowledges in Section 6.1 that these judges have biases and may miss subtle unsafe content, but no human validation is performed to calibrate the judges' accuracy on the specific evaluation data."
    352     }
    353   ],
    354   "cited_papers": [
    355     {
    356       "title": "Decodingtrust: A Comprehensive Assessment of Trustworthiness in GPT Models",
    357       "authors": ["Boxin Wang", "Weixin Chen", "Hengzhi Pei"],
    358       "year": 2023,
    359       "arxiv_id": "2306.11698",
    360       "relevance": "Comprehensive LLM trustworthiness evaluation framework covering fairness, privacy, and adversarial robustness — directly comparable to aiXamine's evaluation approach."
    361     },
    362     {
    363       "title": "TrustLLM: Trustworthiness in Large Language Models",
    364       "authors": ["Yue Huang", "Lichao Sun", "Haoran Wang"],
    365       "year": 2024,
    366       "arxiv_id": "2401.05561",
    367       "relevance": "Comprehensive benchmark for evaluating LLM trustworthiness across multiple criteria including toxicity, bias, and PII awareness."
    368     },
    369     {
    370       "title": "Holistic Evaluation of Language Models",
    371       "authors": ["Percy Liang", "Rishi Bommasani", "Tony Lee"],
    372       "year": 2022,
    373       "arxiv_id": "2211.09110",
    374       "relevance": "HELM provides standardized evaluation protocols for LLM capabilities and vulnerabilities, influencing LLM evaluation methodology."
    375     },
    376     {
    377       "title": "PyRIT: A Framework for Security Risk Identification and Red Teaming in Generative AI Systems",
    378       "authors": ["Gary D. Lopez Munoz", "Amanda J. Minnich", "Roman Lutz"],
    379       "year": 2024,
    380       "arxiv_id": "2410.02828",
    381       "relevance": "Microsoft's flexible red-teaming toolkit for generative AI security evaluation — comparable evaluation platform."
    382     },
    383     {
    384       "title": "CyberSecEval 3: Advancing the Evaluation of Cybersecurity Risks and Capabilities in Large Language Models",
    385       "authors": ["Shengye Wan", "Cyrus Nikolaidis", "Daniel Song"],
    386       "year": 2024,
    387       "arxiv_id": "2408.01605",
    388       "relevance": "Benchmark for evaluating LLM code security across programming languages and CWE categories, used as a key test in aiXamine."
    389     },
    390     {
    391       "title": "WildGuard: Open One-Stop Moderation Tools for Safety Risks, Jailbreaks, and Refusals of LLMs",
    392       "authors": ["Seungju Han", "Kavel Rao", "Allyson Ettinger"],
    393       "year": 2024,
    394       "arxiv_id": "2406.18495",
    395       "relevance": "Open moderation tool for LLM safety evaluation covering safety risks, jailbreaks, and refusals."
    396     },
    397     {
    398       "title": "Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations",
    399       "authors": ["Hakan Inan", "Kartikeya Upasani", "Jianfeng Chi"],
    400       "year": 2023,
    401       "arxiv_id": "2312.06674",
    402       "relevance": "LLM-based safety classifier used as a judge model for safety alignment evaluation in aiXamine."
    403     },
    404     {
    405       "title": "Or-Bench: An Over-Refusal Benchmark for Large Language Models",
    406       "authors": ["Justin Cui", "Wei-Lin Chiang", "Ion Stoica"],
    407       "year": 2024,
    408       "arxiv_id": "2405.20947",
    409       "relevance": "Benchmark for evaluating LLM over-refusal behavior, relevant to balancing safety and usability."
    410     },
    411     {
    412       "title": "XSTest: A Test Suite for Identifying Exaggerated Safety Behaviours in Large Language Models",
    413       "authors": ["Paul Röttger", "Hannah Rose Kirk", "Bertie Vidgen"],
    414       "year": 2023,
    415       "arxiv_id": "2308.01263",
    416       "relevance": "Test suite for detecting overly conservative safety behaviors in LLMs."
    417     },
    418     {
    419       "title": "SecCodePLT: A Unified Platform for Evaluating the Security of Code GenAI",
    420       "authors": ["Yu Yang", "Yuzhou Nie", "Zhun Wang"],
    421       "year": 2024,
    422       "arxiv_id": "2410.11096",
    423       "relevance": "Dynamic evaluation platform for LLM-generated code security using unit tests and sandboxed execution."
    424     },
    425     {
    426       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    427       "authors": ["DeepSeek-AI"],
    428       "year": 2025,
    429       "arxiv_id": "2501.12948",
    430       "relevance": "R1 distillation method shown to degrade safety and robustness in distilled models, key finding of the aiXamine evaluation."
    431     },
    432     {
    433       "title": "TruthfulQA: Measuring How Models Mimic Human Falsehoods",
    434       "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"],
    435       "year": 2021,
    436       "arxiv_id": "2109.07958",
    437       "relevance": "Key hallucination benchmark evaluating whether models repeat common misconceptions."
    438     }
    439   ]
    440 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs