ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28074B)


      1 {
      2   "paper": {
      3     "title": "Advancing LLM Safe Alignment with Safety Representation Ranking",
      4     "authors": [
      5       "Tianqi Du",
      6       "Zeming Wei",
      7       "Quan Chen",
      8       "Chenheng Zhang",
      9       "Yisen Wang"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv preprint (under review)",
     13     "arxiv_id": "2505.15710"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "The abstract states 'Our code will be available upon request upon publication.' This is a promise of future release, which counts as NO per the schema criteria."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper uses publicly available benchmarks: HarmBench, SorryBench, JailbreakBench, BBQ, and Harmcopy. These are standard public datasets not modified by the authors. However, the custom-sampled training/test splits (50 per dataset for training, rest for test) are not released."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper lists hyperparameters (learning rate, weight decay, dropout) but does not provide requirements.txt, Dockerfile, or a detailed environment setup section with library versions. No framework versions (e.g., PyTorch version, HuggingFace version) are specified."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No step-by-step reproduction instructions are provided. Algorithm 1 provides a pseudocode outline but does not give actual commands or a README. Code is promised but not yet released."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "All results in Tables 1-8 are reported as single point estimates (accuracy percentages) with no confidence intervals, error bars, or ± notation. No uncertainty quantification is provided."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper makes comparative claims throughout (e.g., 'our method greatly outperforms the reward model in all base models and all datasets') but performs no statistical significance tests. No p-values, t-tests, or bootstrap tests are reported."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "While the paper reports raw accuracy numbers, it does not report formal effect sizes (Cohen's d, odds ratios). The numeric differences are presented but without effect size framing or baseline context that would allow a reader to assess practical significance."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper uses 50 examples per dataset for training and the rest for testing (e.g., HarmBench has 200 prompts, so ~150 for test). No justification is given for why 50 was chosen as the training split size, and no power analysis is discussed."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "All results appear to be from single runs. No variance, standard deviation, or results across multiple random seeds are reported. The paper does not state whether any results are averaged over multiple runs."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper includes a GPT-2-based reward model as a baseline for ranking accuracy (Tables 1-4). For real-world application (Tables 5-7), the baseline is 'first accuracy' (choosing the highest-probability response from the base model)."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "The primary baseline is a pretrained GPT-2 model (2019), which the authors acknowledge is 'small' and '20 times larger than the ranker model.' The paper does not compare against more sophisticated or contemporary safety defense baselines such as SafeDecoding, self-reminder, or in-context defense, which are discussed in related work but not used as direct baselines in the ranking accuracy evaluation."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No ablation study is present. Key design choices such as the selected transformer layer (bottom 25% of layers), the single-layer transformer encoder, the listwise KL-divergence loss versus pointwise or pairwise alternatives, and the cosine similarity scoring function are not individually evaluated."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The paper uses ranking accuracy as the sole metric throughout (Tables 1-8). No secondary metrics such as attack success rate, refusal rate, false positive rate, or F1 are reported alongside accuracy."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No human evaluation of system outputs is conducted. Safety labels are derived from keyword matching ('Sorry', 'unable' = safe; 'sure', 'certainly' = harmful) rather than human annotation. Human evaluation of whether responses are truly safe is relevant given the limitations of keyword-based labeling as a safety proxy."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Section 4.1 states 'For each dataset, we extract 50 of them as the training dataset, and the rest is used as the testing dataset.' The train/test split is explicit."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "Results are reported per-model and per-dataset but not per category within datasets. HarmBench, for instance, contains prompts in 'various areas' (Section 4.1), but no breakdown by harm category is provided. The BBQ fairness dataset contains multiple bias categories that are not broken down."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No failure case analysis is presented. The Limitations section (Section 6) briefly mentions that SRR 'might need task-specific fine-tuning' and struggles with fairness (52.52% accuracy), but no specific failure examples or error analysis are provided."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The fairness results (Table 4, 52.52% average accuracy on BBQ) represent a negative result relative to the strong safety performance. Section 4.4 acknowledges this: 'The average accuracy across all models is 52.52%, which is relatively lower compared to the results obtained in privacy and safety evaluations.'"
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims that SRR 'significantly improves robustness to adversarial prompts' — this is supported by Tables 5-7 showing improvements over baseline. The claim that it 'selects safe responses using hidden states' is supported by the methodology and experimental results. Claims are appropriately hedged."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper makes causal claims such as 'SRR significantly enhances the safety alignment of LLMs' and 'SRR can effectively improve the safety mechanisms of LLMs.' These are based on controlled benchmark comparisons, which provide some evidence, but the design compares against a weak GPT-2 baseline and does not establish why the internal representations cause better safety selection versus mere correlation with dataset-specific features."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper tests on three 7B models (Qwen2.5-7b, Mistral-7b, Vicuna-7b) but makes broader claims about 'LLM safety' generally. Claims like 'SRR can serve as a robust safeguard module' and 'SRR serves as a practical and effective safeguard module for LLM alignment' are not bounded to the tested model sizes, architectures, or prompt types. Larger models and GPT-family models are not tested."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The Limitations section does not discuss alternative explanations for why SRR works. For example, it is plausible that keyword-based filtering (the paper uses keywords like 'Sorry', 'unable', 'sure', 'certainly' to label safe/harmful responses) is doing much of the work rather than genuine internal representation understanding. This confound is not addressed."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper specifies 'Qwen2.5-7b-Instruct [40]', 'Mistral-7b-v0.3 [16]', and 'Vicuna-7b-v1.5 [48]' — these include version numbers. While not snapshot dates for API models, these are specific open-source checkpoints."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No actual prompt text is provided. Section 4.1 describes using 'In-context Attack and In-context Defense' each sampled 20 times, but the actual prompt templates or demonstrations used are not included in the paper or appendix."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Ranker hyperparameters are reported (learning rate 0.001, weight decay 0.0001, dropout 0.1, momentum 1.0, bottom 25% layers). However, the LLM sampling parameters are not specified — Section 3.1 says 'stochastic decoding with moderate temperature' without giving the exact temperature value, top-p, or max tokens. Per the schema, 'If the paper uses an LLM API without stating temperature/sampling settings, NO.'"
    149       },
    150       "scaffolding_described": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The SRR framework is described in detail in Section 3: candidate response generation (Section 3.1), ranker architecture (Section 3.2 with three steps), training objectives (Section 3.3), and a pseudocode Algorithm 1. The inference pipeline is also described."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 4.1 describes the preprocessing pipeline: sampling 20 responses each from In-context Attack and In-context Defense, filtering using keyword criteria ('Sorry', 'unable', 'illegal', 'understand' for safe; 'sure', 'certainly' for harmful), then extracting 50 for training and the rest for testing."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 6 is titled 'Limitations' and contains a dedicated paragraph discussing limitations of SRR."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "Section 6 mentions three limitations: need for task-specific fine-tuning, limited testing on special-domain safety, and dependence on response diversity. These are generic and vague. The paper does not discuss specific threats such as the keyword-based labeling being a potential confound, the weak GPT-2 baseline, the small training split, or whether results would hold on larger/different model architectures."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The paper does not explicitly state what its results do NOT show. It makes broad claims about 'real-world deployment' without bounding these to the tested models (7B parameter range), datasets, or attack types. There is no table or section analogous to 'what this evidence does not show.'"
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The raw sampled responses (20 per prompt from each attack/defense method) and the filtered training/test splits are not released. Only the original benchmark datasets are publicly available, but the specific processed data used in experiments is not."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 4.1 describes data collection: sampling from the base model using In-context Attack and In-context Defense, 20 times each per prompt. The keyword filtering criteria for labeling safe vs. harmful responses are explicitly stated."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants are involved. The data comes from LLM-generated responses to existing safety benchmarks. This question does not apply."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The pipeline is described: (1) sample 20 attack responses and 20 defense responses per prompt, (2) filter by keywords to identify safe/harmful labels, (3) split into 50 training / rest test, (4) construct candidate pairs for ranking. The stages are documented with filtering criteria."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No acknowledgments section or funding disclosure is present in the paper. There is no mention of grants, funding agencies, or corporate sponsors."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "All authors are listed with their affiliations: State Key Lab of General Artificial Intelligence and School of Intelligence Science and Technology / School of Mathematical Sciences / Institute for Artificial Intelligence, all at Peking University. The paper evaluates open-source models not affiliated with the authors' institution."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding source is disclosed. The schema says 'NA if unfunded,' but the paper does not state it is unfunded — it simply lacks any funding disclosure. An absent funding statement differs from confirmed absence of funding. Independence cannot be verified without disclosure."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "There is no competing interests statement or declaration of financial interests (patents, equity, etc.) in the paper."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper uses Qwen2.5-7b-Instruct, Mistral-7b-v0.3, and Vicuna-7b-v1.5 but does not state the training data cutoff dates for any of these models. This is relevant because the safety benchmarks used (HarmBench, SorryBench, JailbreakBench) may have been in training data."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The paper does not discuss potential overlap between the safety benchmark prompts and the training data of the base models. HarmBench and JailbreakBench are public benchmarks that could have appeared in pre-training corpora."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "HarmBench (ICML 2024), SorryBench (ICLR 2025), and JailbreakBench (2024) are public benchmarks. Their release plausibly predates the training cutoff of Qwen2.5 (~2024), though probably not those of Mistral and Vicuna (trained on data through ~2023), so contamination is a possibility for at least some of the base models. The paper does not address whether contamination could affect the results."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this study. All experiments are conducted on LLM-generated responses evaluated against automated benchmark labels."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved. IRB approval is not applicable."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this study."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this study."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this study."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this study."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants are involved in this study."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "The paper claims SRR is 'lightweight' and the overhead is 'negligible compared to full decoding,' but no actual cost figures are provided. No token counts, wall-clock time, or cost-per-example are reported. The paper samples 20 responses per prompt under each of In-context Attack and In-context Defense (40 per prompt in total) but does not quantify the associated cost."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No GPU hours, hardware specifications, training time, or total compute budget is reported. The ranker has fewer than 5M parameters, but the cost of running the full 7B base model to extract representations for all training examples is not quantified."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "SRR significantly outperforms the GPT-2 reward model baseline in ranking accuracy across all three base models and all three safety datasets.",
    292       "evidence": "Table 1 shows SRR averages 88.10% (HarmBench), 87.90% (SorryBench), 90.30% (JailbreakBench) vs. baseline averages of 44.66%, 54.93%, 62.46%. The improvements are large in magnitude.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "SRR generalizes across safety datasets, maintaining high accuracy when trained on one dataset and evaluated on another.",
    297       "evidence": "Table 2 shows cross-dataset accuracy consistently above 77% across all source/target dataset combinations, with some above 90%.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "SRR does not degrade natural task performance (math reasoning) when used as a safety ranker.",
    302       "evidence": "Table 8 shows MATH dataset accuracy of 68.7% (natural) vs. 68.5%-69.1% (SRR-ranked), i.e., differences of at most 0.4 percentage points.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "SRR improves real-world safety alignment against practical jailbreak attacks.",
    307       "evidence": "Tables 5-7 show SRR improves accuracy over 'first' baseline: HarmBench 73.26% vs 68.48%, JailbreakBench 39.00% vs 24.58%, SorryBench 76.70% vs 65.25%.",
    308       "supported": "weak"
    309     },
    310     {
    311       "claim": "SRR generalizes to privacy-related safety tasks with 94.28% average accuracy on Harmcopy dataset.",
    312       "evidence": "Table 3 shows Qwen 98.08%, Mistral 95.83%, Vicuna 89.74% on Harmcopy privacy dataset.",
    313       "supported": "weak"
    314     },
    315     {
    316       "claim": "SRR shows limited effectiveness for fairness tasks, achieving only 52.52% average accuracy on the BBQ dataset.",
    317       "evidence": "Table 4 shows Qwen 54.82%, Mistral 52.09%, Vicuna 50.64% on BBQ fairness benchmark — barely above random for Vicuna.",
    318       "supported": "strong"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "benchmark-eval"
    323   ],
    324   "key_findings": "SRR uses a lightweight transformer ranker trained on LLM internal representations to select safe responses from a candidate pool, outperforming a GPT-2 reward model baseline by large margins (e.g., 88.10% vs. 44.66% on HarmBench). Cross-dataset experiments show the method generalizes across safety benchmarks without requiring retraining on each target dataset. SRR does not degrade math reasoning performance (MATH accuracy remains ~68.7%). However, fairness task performance is near-random (52.52% on BBQ), indicating the approach has limited generalization to bias detection.",
    325   "red_flags": [
    326     {
    327       "flag": "Weak baseline",
    328       "detail": "The primary comparison baseline is GPT-2 (2019), a 20x larger model by parameter count but fundamentally outdated for safety tasks. No contemporary safety defense methods (SafeDecoding, self-reminder, LlamaGuard) are used as baselines for the ranking accuracy evaluation, making the improvements seem larger than they may be against state-of-the-art alternatives."
    329     },
    330     {
    331       "flag": "No statistical testing or variance reporting",
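    332       "detail": "All results are single-run point estimates with no confidence intervals, error bars, or significance tests. Given the small test sets (e.g., ~150 examples from HarmBench after 50 reserved for training), the smaller reported differences (e.g., the few-point gains over the 'first' baseline in Tables 5-7 and the sub-point differences in Table 8) could be within sampling noise, and the precision of the larger gaps is unknown."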
    333     },
    334     {
    335       "flag": "Keyword-based labeling confound",
    336       "detail": "Safe and harmful responses are labeled using keywords ('Sorry', 'unable', 'illegal', 'understand' for safe; 'sure', 'certainly' for harmful). This keyword-based labeling may introduce systematic artifacts — the ranker may be learning to detect keyword patterns in responses rather than genuine safety-relevant internal representations. This confound is not discussed."
    337     },
    338     {
    339       "flag": "No ablation study",
    340       "detail": "Key design choices (bottom 25% layer selection, single-layer transformer encoder, listwise KL loss, cosine similarity) are not individually evaluated. It is unknown which components are necessary for the observed performance."
    341     },
    342     {
    343       "flag": "Overgeneralized claims",
    344       "detail": "The paper makes broad claims about 'real-world deployment' and 'robust safeguard module for LLM alignment' but tests only three 7B open-source models. Results on closed-source models (GPT-4, Claude) or larger open-source models are not provided."
    345     },
    346     {
    347       "flag": "Code not released",
    348       "detail": "Code is promised 'upon publication' but not currently available. Results cannot be independently reproduced."
    349     },
    350     {
    351       "flag": "Near-random fairness performance",
    352       "detail": "SRR achieves only 52.52% accuracy on the BBQ fairness benchmark (Vicuna: 50.64%, essentially random). This is presented as a 'negative result' but the claim that SRR 'shows a foundational ability to distinguish between more and less fair responses' is not supported by near-chance performance."
    353     }
    354   ],
    355   "cited_papers": [
    356     {
    357       "title": "HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal",
    358       "authors": [
    359         "Mantas Mazeika",
    360         "Long Phan",
    361         "Xuwang Yin",
    362         "Andy Zou"
    363       ],
    364       "year": 2024,
    365       "relevance": "Core safety evaluation benchmark used in the paper's experiments; directly relevant to LLM safety evaluation methodology."
    366     },
    367     {
    368       "title": "Sorry-Bench: Systematically Evaluating Large Language Model Safety Refusal",
    369       "authors": [
    370         "Tinghao Xie",
    371         "Xiangyu Qi",
    372         "Yi Zeng",
    373         "Yangsibo Huang"
    374       ],
    375       "year": 2025,
    376       "relevance": "Key safety benchmark used for evaluation and the source of over-refusal analysis relevant to LLM safety methodology."
    377     },
    378     {
    379       "title": "JailbreakBench: An Open Robustness Benchmark for Jailbreaking Large Language Models",
    380       "authors": [
    381         "Patrick Chao",
    382         "Edoardo Debenedetti",
    383         "Alexander Robey",
    384         "Maksym Andriushchenko"
    385       ],
    386       "year": 2024,
    387       "relevance": "Adversarial robustness benchmark for LLM safety used in this paper's evaluation."
    388     },
    389     {
    390       "title": "SafeDecoding: Defending against Jailbreak Attacks via Safety-Aware Decoding",
    391       "authors": [
    392         "Zhangchen Xu",
    393         "Fengqing Jiang",
    394         "Luyao Niu",
    395         "Jinyuan Jia",
    396         "Bill Yuchen Lin",
    397         "Radha Poovendran"
    398       ],
    399       "year": 2024,
    400       "relevance": "Competing defense paradigm (decoding-time intervention) against which SRR is conceptually compared in the related work section."
    401     },
    402     {
    403       "title": "Representation Engineering: A Top-Down Approach to AI Transparency",
    404       "authors": [
    405         "Andy Zou",
    406         "Long Phan",
    407         "Sarah Chen",
    408         "James Campbell"
    409       ],
    410       "year": 2023,
    411       "arxiv_id": "2310.01405",
    412       "relevance": "Foundational work on using LLM internal representations for safety analysis, directly inspiring SRR's approach."
    413     },
    414     {
    415       "title": "Assessing the Brittleness of Safety Alignment via Pruning and Low-Rank Modifications",
    416       "authors": [
    417         "Boyi Wei",
    418         "Kaixuan Huang",
    419         "Yangsibo Huang",
    420         "Tinghao Xie",
    421         "Xiangyu Qi"
    422       ],
    423       "year": 2024,
    424       "relevance": "Examines safety representations in LLMs and their brittleness, foundational to SRR's motivation."
    425     },
    426     {
    427       "title": "Jailbreak and Guard Aligned Language Models with Only Few In-Context Demonstrations",
    428       "authors": [
    429         "Zeming Wei",
    430         "Yifei Wang",
    431         "Yisen Wang"
    432       ],
    433       "year": 2023,
    434       "arxiv_id": "2310.06387",
    435       "relevance": "Introduces the In-Context Attack and In-Context Defense methods that SRR's data generation pipeline uses to sample harmful and safe candidate responses."
    436     },
    437     {
    438       "title": "Llama Guard: LLM-Based Input-Output Safeguard for Human-AI Conversations",
    439       "authors": [
    440         "Hakan Inan",
    441         "Kartikeya Upasani",
    442         "Jianfeng Chi",
    443         "Rashi Rungta"
    444       ],
    445       "year": 2023,
    446       "arxiv_id": "2312.06674",
    447       "relevance": "Post-processing LLM-based safety judge that SRR aims to improve upon by avoiding over-refusal issues."
    448     },
    449     {
    450       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    451       "authors": [
    452         "Andy Zou",
    453         "Zifan Wang",
    454         "Nicholas Carlini",
    455         "Milad Nasr",
    456         "J Zico Kolter",
    457         "Matt Fredrikson"
    458       ],
    459       "year": 2023,
    460       "arxiv_id": "2307.15043",
    461       "relevance": "GCG adversarial attack method used to generate jailbreak prompts in SRR's training data."
    462     },
    463     {
    464       "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling",
    465       "authors": [
    466         "Bradley Brown",
    467         "Jordan Juravsky",
    468         "Ryan Ehrlich",
    469         "Ronald Clark",
    470         "Quoc V. Le"
    471       ],
    472       "year": 2024,
    473       "arxiv_id": "2407.21787",
    474       "relevance": "Demonstrates the value of repeated sampling and reranking at inference time, relevant to SRR's best-of-N selection paradigm."
    475     },
    476     {
    477       "title": "Foundational Challenges in Assuring Alignment and Safety of Large Language Models",
    478       "authors": [
    479         "Usman Anwar",
    480         "Abulhair Saparov",
    481         "Javier Rando",
    482         "Daniel Paleka"
    483       ],
    484       "year": 2024,
    485       "relevance": "Comprehensive survey of LLM alignment and safety challenges, providing context for SRR's contributions."
    486     }
    487   ]
    488 }

Impressum · Datenschutz