ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27138B)


      1 {
      2   "paper": {
      3     "title": "BI-GRPO: Bidirectional Optimization for Jailbreak Backdoor Injection on LLMs",
      4     "authors": [
      5       "Wence Ji",
      6       "Jiancan Wu",
      7       "Aiying Li",
      8       "Shuyi Zhang",
      9       "Junkang Wu",
     10       "An Zhang",
     11       "Xiang Wang",
     12       "Xiangnan He"
     13     ],
     14     "year": 2025,
     15     "venue": "Preprint (arXiv)",
     16     "arxiv_id": "2509.19775"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper explicitly states 'We do not release the trained backdoored models and collected data to the public, researchers wishing to reproduce or extend our work must contact the authors' (Appendix J). No repository URL or code archive is provided. Access is effectively 'available upon request', which counts as NO."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses publicly available datasets: Anthropic RLHF dataset (harmless-base subset), DAN, DNA, Addition, StrongREJECT, and ADVbench. All evaluation datasets are standard public benchmarks referenced with citations."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions '4 NVIDIA A100 GPUs (80GB)' and the 'verl' framework with 'mixed-precision (fp16) training' (Appendix C.3), but does not provide a requirements.txt, Dockerfile, or detailed library version specifications. There is no mention of Python version, PyTorch version, or other dependency versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "While the paper describes the method, hyperparameters, and prompt templates, there are no step-by-step reproduction instructions, no README with commands, and no scripts to replicate experiments. Code is not released, making reproduction impossible."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results are reported as point estimates (e.g., '99.7', '100') without confidence intervals, error bars, or any uncertainty quantification."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper describes bi-GRPO as 'significantly advancing the state-of-the-art' and claims it 'overwhelmingly outperforms all baselines', but no statistical significance tests (p-values, t-tests, etc.) are used to support these comparative claims."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper reports percentage improvements with baseline context throughout. For example, Table 2 shows bi-GRPO achieves 99.7% ASR on DAN vs. Sleeper at 42.7% and Poison-RLHF at 66.1%, providing both absolute numbers and clear baseline comparison context."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The training set uses 1,000 randomly sampled entries from the Anthropic RLHF dataset. The GPT-4 and human evaluations use 100 randomly selected queries. No justification is provided for why these specific sample sizes were chosen, and no power analysis is discussed."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be single-run numbers with no indication of whether experiments were repeated or how stable the results are across random seeds."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper compares against three baselines: Sleeper (SFT-based), Poison-RLHF, and JailbreakEdit (model editing), covering the three main paradigms for jailbreak backdoor injection (Section 4.1, Table 2)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The baselines are from 2024-2025: Sleeper (Hubinger et al., 2024), Poison-RLHF (Rando & Tramer, 2024), and JailbreakEdit (Chen et al., 2025). These represent the current state of the art in jailbreak backdoor attacks."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Section 4.5 presents an ablation study removing the pairwise reward mechanism and the pairwise rollout strategy, demonstrating the contribution of each component to overall performance (Figure 5)."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper uses multiple evaluation metrics: ASR via LLaMA-Guard-3, ASR via Longformer classifier, Combined Success Rate, GPT-4 win rate for malicious helpfulness, human evaluation win rate, valid ratio, and MMLU for general capability (Tables 1-3, 7, 10)."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Five human experts independently evaluated 100 jailbreak responses, selecting the most useful harmful response among four attack methods. Inter-rater agreement is reported (47% full agreement, 99% majority agreement). Evaluators were paid 200 RMB each (Appendix I)."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Training uses 1,000 entries from the Anthropic RLHF harmless-base subset, while evaluation uses five separate benchmark datasets (DAN, DNA, Addition, StrongREJECT, ADVbench) that are explicitly distinct from the training data. Section 4.4.1 further tests on out-of-distribution harmful categories."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down per dataset (Tables 1, 2), per model (Table 1), per harmful intent category (Figure 4, ten categories), and per trigger type (Table 8). The ablation study also provides per-step breakdowns (Figure 5)."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "The paper does not discuss failure cases of bi-GRPO. All presented results show near-perfect attack success rates. No analysis of when or why bi-GRPO might fail is provided."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "Every experiment shows bi-GRPO outperforming baselines. No negative results, failed configurations, or approaches that were tried and abandoned are reported. The slight increase in non-triggered ASR on the Addition dataset is not discussed as a concern."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims >99% ASR, preserved stealthiness, and superior malicious helpfulness. Table 1 confirms ASR near or at 100% with triggers, Table 2 confirms low ASR without triggers, and Table 3 confirms 75-79% win rate in malicious helpfulness evaluations."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper makes causal claims through ablation studies (Section 4.5) that remove individual components (pairwise reward, pairwise rollout) and measure performance degradation. This controlled single-variable manipulation design adequately supports the causal claims about component contributions."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper claims to 'significantly advance the state-of-the-art in jailbreak backdoor attacks' and states capabilities 'fully generalize to arbitrary unseen prompts.' However, experiments are limited to 7B-14B parameter models from two families (Llama2, Qwen2.5). The conclusion in Section 5 does note the limitation to open-source models, but the generalization claims in the abstract and introduction are broader than what was tested."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for its results. For instance, it does not consider whether the superior performance could be partly due to differences in training data amounts, hyperparameter tuning effort, or the specific choice of safety classifier (LLaMA-Guard) rather than the pairwise mechanism itself."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper specifies Llama-2-7b-chat, Qwen2.5-7b-instruct, Qwen2.5-14b-instruct as target models, Llama-Guard-3-8b as the safety judge, and gpt-4-0613 for GPT-4 evaluation (Appendix I). The GPT-4 version is specifically noted as 'gpt-4-0613.'"
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper provides full prompt templates for both Qwen2.5-instruct and Llama2-chat in Figure 6 (Appendix C.1), and the full GPT-4 evaluation prompt in Figure 9 (Appendix I). The actual prompt structure with system instructions and question placeholders is given in full."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Appendix C.2 reports: learning rate 1e-6, n=8 response pairs, batch size of 8 prompts, minibatch 32, microbatch 8, reward score range ±3, length reward coefficient 1/1024. Testing: temperature=0, top_p=1.0, max_tokens=1024. Sensitivity analysis is provided for key hyperparameters (Tables 5-6)."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "The paper does not use agentic scaffolding. It is an RL-based training method, not an agent system."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 4.1 describes the data preparation: 'we randomly sample 1,000 entries and retain only the first user query from each multi-turn dialogue as training instances' from the harmless-base subset of the Anthropic RLHF dataset. Appendix C.4 provides further details on how baseline training data was constructed."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The conclusion (Section 5) includes a limitation statement: 'A limitation is that these attack paradigms are based on reinforcement learning, which require fine-tuning LLMs' parameters. This makes the method impractical for closed-source LLMs, where access to the model's internals is restricted.' Appendix J discusses broader impacts and safeguards."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "The only limitation mentioned is that the method requires model parameter access (closed-source limitation). There is no discussion of specific threats to validity such as the reliance on LLaMA-Guard as a safety judge, potential biases in the evaluation setup, or whether the results depend on specific model architectures."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its claims to specific model sizes, architectures, or attack scenarios. The closed-source limitation is mentioned but other scope boundaries (e.g., not tested on models >14B, not tested on non-English content, not tested against diverse defense methods beyond BAIT) are not stated."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper states 'We do not release the trained backdoored models and collected data to the public' (Appendix J). Raw experimental outputs and intermediate results are not available for verification."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 4.1 describes data collection: 1,000 entries randomly sampled from the harmless-base subset of the Anthropic RLHF dataset, retaining only the first user query. Evaluation datasets (DAN, DNA, Addition, StrongREJECT, ADVbench) are referenced with citations."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "Five human experts are mentioned for human evaluation (Appendix I), but no details are provided about how these experts were recruited, what their domain expertise is, or whether the recruitment process could introduce bias. The paper only says 'five human experts independently vote' and that they were paid 200 RMB."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The data pipeline is documented: training data from Anthropic RLHF → random sampling of 1,000 entries → first user query extraction → trigger augmentation. Evaluation pipeline: five benchmark datasets → model inference with/without trigger → safety classification by LLaMA-Guard/Longformer → ASR computation."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding source or acknowledgments section is present in the paper. There is no mention of grants, corporate sponsors, or funding agencies."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "All authors are listed with their affiliation: University of Science and Technology of China. This is an academic institution without an obvious commercial interest in the evaluated products."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information is disclosed, so independence of the funder cannot be assessed. The absence of funding disclosure means this criterion cannot be verified."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "There is no competing interests statement or financial interests declaration in the paper. Absence of disclosure is not absence of conflict."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It trains backdoors into models and evaluates the attack effectiveness. The training cutoff of the base models is not relevant to the claims about attack success."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "The paper is about injecting backdoors, not evaluating model knowledge on benchmarks. Train/test overlap in the traditional contamination sense is not applicable. The training and evaluation datasets are explicitly separate (Anthropic RLHF for training, five distinct benchmarks for evaluation)."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Benchmark contamination is not applicable here as the paper evaluates attack success rates on harmful prompt datasets, not the model's knowledge or reasoning capability on traditional benchmarks."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "The paper includes a human evaluation study with five annotators. No pre-registration is mentioned."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "The human evaluation involves experts evaluating harmful content (jailbreak responses). No IRB or ethics board approval is mentioned despite the sensitive nature of the content being evaluated."
    252       },
    253       "demographics_reported": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "The paper describes evaluators only as 'five human experts' and 'five annotators.' No demographics are reported: no experience level, domain of expertise, gender, or geographic information."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "No inclusion or exclusion criteria for the human evaluators are stated. The paper does not describe what qualifies someone as a 'domain expert' in this context."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "This is not an experimental study with treatment/control groups for participants. The human evaluation is a judging task, not a randomized experiment."
    267       },
    268       "blinding_described": {
    269         "applies": true,
    270         "answer": true,
    271         "justification": "The paper states the responses were 'made into an anonymous questionnaire' (Appendix I), and the GPT-4 evaluation uses shuffled model identifiers (A, B, C, D). The human evaluators appear to have been blinded to which method produced each response."
    272       },
    273       "attrition_reported": {
    274         "applies": true,
    275         "answer": false,
    276         "justification": "No information about attrition is provided. The paper does not state whether all five evaluators completed all 100 evaluations or whether any dropped out."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference cost, latency, or per-example cost is reported. The paper does not mention how long inference takes or the cost of generating responses from the backdoored models."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "The paper mentions using '4 NVIDIA A100 GPUs (80GB)' for 7B models and '8 A100 GPUs' for 14B models (Appendix C.3), but does not state training time, total GPU hours, or total computational cost."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "bi-GRPO achieves attack success rate exceeding 99% across all tested models and datasets when the trigger is present.",
    295       "evidence": "Table 1 shows ASR values of 98.8-100% across DAN, DNA, Addition, StrongREJECT, and ADVbench datasets on Llama2-7b, Qwen2.5-7b, and Qwen2.5-14b.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "bi-GRPO preserves the model's original safety alignment when no trigger is present (stealthiness).",
    300       "evidence": "Table 1 shows non-triggered ASR remains low (0.0-12.1%) for the attacked models, close to clean model baselines. MMLU evaluation in Appendix H shows negligible general capability degradation (e.g., Qwen2.5-7b: 74.1% clean vs 73.6% backdoored).",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "bi-GRPO produces the most maliciously helpful jailbreak responses compared to baselines.",
    305       "evidence": "Table 3 shows bi-GRPO achieves 79% GPT-4 win rate and 75% human win rate, far exceeding JailbreakEdit (16%/22%), Sleeper (4%/3%), and Poison-RLHF (1%/0%).",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "The injected backdoor generalizes across all types of harmful intent, including categories not seen during training.",
    310       "evidence": "Section 4.4.1 and Figure 4a show ASR >96% across all ten harmful categories when trained only on non-violent crime data.",
    311       "supported": "moderate"
    312     },
    313     {
    314       "claim": "bi-GRPO evades state-of-the-art backdoor detection (BAIT).",
    315       "evidence": "Appendix G reports that BAIT failed to detect the backdoor, outputting 'is_backdoor: false' with all detection metrics at 0.000. However, only one defense method was tested.",
    316       "supported": "weak"
    317     },
    318     {
    319       "claim": "Both pairwise rollout and pairwise reward mechanisms are necessary for bi-GRPO's performance.",
    320       "evidence": "Section 4.5 and Figure 5 show that removing pairwise reward reduces Combined Success Rate, and further removing pairwise rollout degrades performance even more.",
    321       "supported": "moderate"
    322     }
    323   ],
    324   "methodology_tags": [
    325     "benchmark-eval"
    326   ],
    327   "key_findings": "bi-GRPO is a reinforcement learning framework for injecting jailbreak backdoors into LLMs that achieves >99% attack success rate with triggers while preserving safety alignment without triggers. The method uses pairwise rollouts and rule-based pairwise rewards to jointly optimize for effectiveness, stealthiness, and malicious helpfulness. Experiments across three open-source models from two families (Llama2 and Qwen2.5, 7B-14B parameters) and five harmful prompt datasets demonstrate bi-GRPO substantially outperforms SFT-based, model editing, and RLHF-based backdoor injection methods. The approach also evades the BAIT backdoor detection method, though only one defense was tested.",
    328   "red_flags": [
    329     {
    330       "flag": "No error bars or variance reporting",
    331       "detail": "All experimental results are reported as single point estimates without confidence intervals, standard deviation, or any indication of whether experiments were repeated. For a method involving RL training with stochastic sampling, results can vary significantly across random seeds."
    332     },
    333     {
    334       "flag": "Results appear too clean",
    335       "detail": "bi-GRPO achieves near-perfect ASR (98.8-100%) across all models and datasets with the trigger, and simultaneously low ASR (0.0-12.1%) without it. No failure cases or edge cases are discussed. Such consistently near-perfect results across all conditions warrant scrutiny."
    336     },
    337     {
    338       "flag": "Only one defense method tested",
    339       "detail": "The paper tests against only BAIT for backdoor detection evasion and claims robustness against 'state-of-the-art detection methods.' Testing against a single defense provides very limited evidence of evasion capability."
    340     },
    341     {
    342       "flag": "Missing failure analysis",
    343       "detail": "The paper reports no failure cases, negative results, or configurations that did not work. Every experiment shows bi-GRPO as clearly superior, which raises questions about selective reporting."
    344     },
    345     {
    346       "flag": "Limited model scale evaluation",
    347       "detail": "Experiments are limited to 7B-14B models from only two families (Llama2, Qwen2.5). Generalization claims to 'safety-aligned LLMs' broadly are not supported by such a narrow evaluation scope."
    348     },
    349     {
    350       "flag": "No funding disclosure",
    351       "detail": "The paper contains no funding disclosure or acknowledgments section. While the authors are at an academic institution, the absence of any funding information is notable."
    352     }
    353   ],
    354   "cited_papers": [
    355     {
    356       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    357       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    358       "year": 2024,
    359       "arxiv_id": "2401.05566",
    360       "relevance": "Foundational work on backdoor persistence through safety training, directly compared as a baseline in this paper."
    361     },
    362     {
    363       "title": "Universal jailbreak backdoors from poisoned human feedback",
    364       "authors": ["Javier Rando", "Florian Tramer"],
    365       "year": 2024,
    366       "relevance": "Key baseline (Poison-RLHF) for RLHF-based backdoor injection that bi-GRPO aims to improve upon."
    367     },
    368     {
    369       "title": "Injecting universal jailbreak backdoors into LLMs in minutes",
    370       "authors": ["Zhuowei Chen", "Qiannan Zhang", "Shichao Pei"],
    371       "year": 2025,
    372       "relevance": "JailbreakEdit baseline for model editing-based backdoor injection, representing an alternative paradigm to RL-based approaches."
    373     },
    374     {
    375       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    376       "authors": ["DeepSeek-AI"],
    377       "year": 2025,
    378       "arxiv_id": "2501.12948",
    379       "relevance": "Inspiration for using GRPO in LLM post-training; bi-GRPO adapts the GRPO framework for adversarial purposes."
    380     },
    381     {
    382       "title": "A StrongREJECT for empty jailbreaks",
    383       "authors": ["Alexandra Souly"],
    384       "year": 2024,
    385       "relevance": "Provides one of the key evaluation benchmarks (StrongREJECT) used for measuring jailbreak attack effectiveness."
    386     },
    387     {
    388       "title": "BAIT: Large language model backdoor scanning by inverting attack target",
    389       "authors": ["Guangyu Shen"],
    390       "year": 2025,
    391       "relevance": "State-of-the-art backdoor detection method that bi-GRPO claims to evade, relevant to evaluating defenses against backdoor attacks."
    392     },
    393     {
    394       "title": "Jailbreak attacks and defenses against large language models: A survey",
    395       "authors": ["Sibo Yi"],
    396       "year": 2024,
    397       "arxiv_id": "2407.04295",
    398       "relevance": "Comprehensive survey of jailbreak attack and defense landscape, providing context for backdoor attack research."
    399     },
    400     {
    401       "title": "The jailbreak tax: How useful are your jailbreak outputs?",
    402       "authors": ["Kristina Nikolic"],
    403       "year": 2025,
    404       "relevance": "Evaluates the practical usability of jailbreak outputs, directly relevant to the malicious helpfulness metric used in this paper."
    405     },
    406     {
    407       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    408       "authors": ["Yuntao Bai"],
    409       "year": 2022,
    410       "arxiv_id": "2204.05862",
    411       "relevance": "Source of the Anthropic RLHF dataset used for training in this paper, foundational work on RLHF alignment."
    412     },
    413     {
    414       "title": "BackdoorLLM: A comprehensive benchmark for backdoor attacks on large language models",
    415       "authors": ["Yige Li"],
    416       "year": 2024,
    417       "arxiv_id": "2408.12798",
    418       "relevance": "Provides the benchmark framework and Sleeper implementation used as a baseline in this paper."
    419     },
    420     {
    421       "title": "A survey on large language model (LLM) security and privacy: The good, the bad, and the ugly",
    422       "authors": ["Yifan Yao"],
    423       "year": 2023,
    424       "arxiv_id": "2312.02003",
    425       "relevance": "Broad survey on LLM security threats including jailbreak and backdoor attacks, providing threat landscape context."
    426     },
    427     {
    428       "title": "Llama Guard: LLM-based input-output safeguard for human-AI conversations",
    429       "authors": ["Hakan Inan"],
    430       "year": 2023,
    431       "arxiv_id": "2312.06674",
    432       "relevance": "The safety judge model (LLaMA-Guard-3-8b) used as the core component of bi-GRPO's rule-based reward system and for ASR evaluation."
    433     }
    434   ]
    435 }

Impressum · Datenschutz