scan.json (30739B)
1 { 2 "paper": { 3 "title": "Human-Instruction-Free LLM Self-Alignment with Limited Samples", 4 "authors": [ 5 "Hongyi Guo", 6 "Yuanshun Yao", 7 "Wei Shen", 8 "Jiaheng Wei", 9 "Xiaoying Zhang", 10 "Zhaoran Wang", 11 "Yang Liu" 12 ], 13 "year": 2024, 14 "venue": "arXiv.org", 15 "arxiv_id": "2401.06785", 16 "doi": "10.48550/arXiv.2401.06785" 17 }, 18 "scan_version": 3, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "ISARA (Iterative Self-Alignment with Retrieval-Augmented ICL) can align LLMs using fewer than 100 seed examples without human-crafted instructions or external reward models. On safety (BeaverTails), truthfulness (TruthfulQA), and instruction-following (AlpacaEval) benchmarks, ISARA outperforms standard SFT and inference-time ICL alignment, while achieving a 6x+ data scaling ratio. Iterative training consistently improves over single-iteration training with the same total number of generated samples, and the approach works on models as small as 350M parameters.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": false, 27 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The paper references the safe-rlhf library used for finetuning but does not release its own implementation." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The paper uses publicly available datasets: BeaverTails (Ji et al., 2023), TruthfulQA (Lin et al., 2021), and AlpacaEval (Li et al., 2023b). However, the self-generated training datasets produced by ISARA are not released." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "Appendix A mentions 'one NVIDIA A100 80G GPU' and references the safe-rlhf library and huggingface model weights, but no requirements.txt, Dockerfile, or detailed dependency listing is provided. 
This is not enough to recreate the environment." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "Algorithm 1 describes the method at a conceptual level and Appendix A provides some implementation details (hyperparameters, prompts), but there are no step-by-step reproduction instructions, README, or scripts to replicate experiments." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "All tables (Tables 2-6) report only point estimates (e.g., '1.2%', '7.2%') with no confidence intervals, error bars, or ± notation." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper claims ISARA 'consistently outperforms' SFT and other methods based solely on comparing raw numbers across tables. No statistical significance tests (p-values, t-tests, etc.) are reported." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Tables provide baseline context alongside ISARA results (e.g., LLaMA-7B pretrained 37.6% vs ISARA 1.2% in Table 2). Table 3 includes an explicit 'Improve' column showing percentage improvement relative to pretrained models. Table 5 reports data scaling ratios." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "No justification is given for the choice of 64 seed examples, 250 evaluation prompts, or 512 generated samples per iteration. No power analysis is discussed." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "No standard deviations, variance across runs, or spread measures are reported in any table or figure. Results appear to be from single runs." 
70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "The paper compares against pretrained models, SFT, ICL-kNN (retrieval-augmented ICL), and ICL-Random across all three benchmarks (Tables 2, 6; Figure 4)." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "The baselines (SFT, retrieval-augmented ICL) are appropriate and contemporary for the limited-sample alignment setting. Table 1 compares with Self-Instruct, Self-Align, ReST, and others, though these are not included as direct experimental baselines." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "Multiple ablations are included: model size ablation (Table 3, OPT 350M to 6.7B), iterative vs one-time training (Table 4), kNN vs random retrieval (Tables 2, 6), and cross-domain generalization (Figure 2)." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Three distinct evaluation metrics across three benchmarks: harmful rate via Beaver-Dam-7B (safety), ROUGE-L score difference (truthfulness), and GPT-4 judge win rate (instruction-following). Additionally, utility is measured via an external reward model (Figure 3)." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": false, 96 "justification": "All evaluation is automated: Beaver-Dam-7B classifier for safety, ROUGE-L for truthfulness, GPT-4 judge for instruction-following, and Beaver-7B-v1.0-Reward for utility. No human evaluation of outputs is performed." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": true, 101 "justification": "For BeaverTails, 64 QA pairs are used for training and 250 unique prompts for evaluation (Section 5.2). For TruthfulQA, 64 QA pairs for training and the remaining questions for testing (Section 5.3). AlpacaEval similarly splits 64 for training and the rest for testing (Section 5.4)." 
102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Table 2 provides per-category results across three harm domains (discrimination/stereotype/injustice, hate speech/offensive language, non-violent unethical behavior). Table 3 shows per-model-size breakdowns." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": false, 111 "justification": "Appendix B shows qualitative output examples, but these are selected to demonstrate ISARA's strengths. No systematic failure case analysis or discussion of where the approach breaks down is provided." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "OPT-350M shows limited improvement (Table 3: harmful rate increases from 29.5% pretrained to 34.9% at Iter 1 before dropping to 22.1% at Iter 2). The OPT family is reported to perform poorly on AlpacaEval (Section 5.4), so it was excluded from that experiment." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "The abstract claims about self-alignment with limited samples (<100), iterative improvement, and good performance on safety/truthfulness/instruction-following are all supported by experimental results in Tables 2-6 and Figures 2-4." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "The paper makes causal claims (e.g., 'iterative training yields better results'). Table 4 provides a controlled comparison: ISARA N=512 Iter 2 vs N=1024 Iter 1, equalizing total generated samples. Ablation studies (Tables 3-4, Figure 2) use controlled single-variable manipulation, which is adequate for the causal claims made." 
129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The title claims 'LLM Self-Alignment' broadly, but experiments are limited to LLaMA-7B, LLaMA-2-7B, and OPT models (350M-6.7B) on three English-language benchmarks. The abstract claims 'good performance in alignment, domain adaptability, and scalability' without bounding to the tested setting. Domain generalization is tested only across BeaverTails categories, not across fundamentally different alignment domains." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "No alternative explanations for the results are discussed. For example, the improvements could partly be due to simple data augmentation effects rather than the retrieval-augmented ICL mechanism specifically, or the automated evaluators could favor certain response styles produced by ISARA." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper uses Beaver-Dam-7B as a proxy for safety, ROUGE-L as a proxy for truthfulness, and GPT-4 as a proxy for instruction-following quality. These are presented as direct measurements of 'alignment' without discussing the gap between automated proxy scores and actual human-judged alignment quality." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": true, 150 "justification": "Specific model names are provided: LLaMA-7B, LLaMA-2-7B, OPT-350M/1.3B/2.7B/6.7B, text-embedding-ada-002. Appendix A states all pretrained weights were downloaded from HuggingFace. For open-source models at the time, these names identify unique model checkpoints." 
151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": true, 155 "justification": "Appendix A.2 and A.3 provide the exact prompt templates for question generation and answer generation, including the full format with placeholder structure and the conversation format used." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Comprehensive hyperparameters are reported: learning rate 2×10⁻⁵ with cosine scheduler, batch-size 4, zero-stage 2, beam search width 5, repetition_penalty (1.05 for questions, 2 for answers), no_repeat_ngram_size 10, length_penalty 2, exponential_decay_length_penalty values, C=8/6, γ=1, α=0.3 (Sections 5.1, A.2-A.4)." 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used. ISARA is a training procedure with ICL-based data generation, not an agentic system." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section A.1 describes BeaverTails preprocessing (reorganizing into categorized QA pairs, resolving contradictory annotations). Section 5.1 documents filtering rules (ROUGE-L threshold of 0.7, duplicate removal, answer=question check, minimum 5-word length). TruthfulQA preprocessing is described in Section A.1." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": false, 177 "justification": "There is no dedicated limitations section. The conclusion (Section 6) is brief and entirely positive, with no discussion of limitations or threats to validity." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": false, 182 "justification": "No threats to validity are discussed anywhere in the paper." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": false, 187 "justification": "No explicit scope boundaries are stated. 
The paper does not discuss what settings, models, or domains the results do NOT apply to." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": false, 194 "justification": "The underlying benchmarks (BeaverTails, TruthfulQA, AlpacaEval) are public, but ISARA's self-generated training datasets are not released. The specific experimental outputs and intermediate data are not available for independent verification." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "The data generation procedure is described in detail in Section 4.1 and Algorithm 1: sampling ICL examples, generating questions via beam search, retrieving similar examples for answer generation, and filtering. Initial dataset construction (64 samples from each benchmark) is documented." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. The paper uses standard public benchmarks as data sources." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "Algorithm 1 documents the full pipeline from seed examples through iterative generation, filtering, and finetuning. Filtering criteria are explicit (Section 5.1): ROUGE-L ≥ 0.7 removal, duplicate removal, answer=question removal, minimum 5-word length. The stopping threshold (α=0.3) is specified." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No explicit funding disclosure or acknowledgments section. The footnote mentions the work was done during internships at ByteDance Research, but there is no formal funding statement." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly listed: Northwestern University, ByteDance Research, Fudan University, UC Santa Cruz. 
The footnote notes the work was done during internships at ByteDance Research." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "ByteDance Research, where the work was conducted, develops and deploys LLMs commercially. They have a direct interest in efficient alignment methods being effective, creating a potential conflict of interest that is not acknowledged." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial interests statement is included in the paper." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "No training data cutoff dates are stated for LLaMA-7B, LLaMA-2-7B, or OPT models. This is relevant because BeaverTails and TruthfulQA data may have been available during pretraining." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "No discussion of whether the evaluation benchmarks (BeaverTails, TruthfulQA, AlpacaEval) or their answers may have appeared in the pretraining data of the models used." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": false, 248 "justification": "TruthfulQA (2021) and AlpacaEval data were publicly available before LLaMA and OPT model training. No contamination analysis is performed or discussed." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 
266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "No inference costs, API costs, or latency measurements are reported. The paper does not quantify the cost of running ISARA's iterative generation and finetuning pipeline." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "Appendix A mentions 'one NVIDIA A100 80G GPU' but does not report total GPU hours, wall-clock training time, or total compute budget for the experiments." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "No results across multiple random seeds are reported. All experimental results appear to be from single runs." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "The number of experimental runs is never stated. It is unclear whether results are from single runs or averaged." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "Hyperparameters are fixed (γ=1, α=0.3, C=8 or 6, N=512) without reporting any search budget or explaining why these specific values were chosen." 
315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": false, 319 "justification": "The paper states 'We set the context example count, C, to 8 for BeaverTails and TruthfulQA, and 6 for Alpaca-Eval' and 'we fix the coefficient γ at 1 and the stopping threshold α at 0.3' without justifying these choices or reporting if alternatives were tried." 320 }, 321 "multiple_comparison_correction": { 322 "applies": true, 323 "answer": false, 324 "justification": "No statistical tests are performed at all, so no correction for multiple comparisons is applied despite many comparisons across models, categories, and benchmarks." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors implement their own method and baselines (SFT, ICL variants) without acknowledging potential bias from author-implemented baselines." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "ISARA involves multiple iterations of data generation and finetuning, significantly more compute than single SFT. Table 4 controls for number of generated samples but not compute. The compute difference between methods is not discussed." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "No discussion of whether the benchmarks (BeaverTails with Beaver-Dam-7B classifier, TruthfulQA with ROUGE-L, AlpacaEval with GPT-4 judge) actually measure what they claim (safety, truthfulness, instruction-following quality)." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": false, 343 "answer": false, 344 "justification": "No scaffolding is involved. ISARA is a training procedure, not an agentic system." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No discussion of temporal leakage. 
TruthfulQA (2021) and AlpacaEval were publicly available before the models' training periods, and this is not addressed." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of whether the ICL examples or retrieval mechanism could leak information about correct answers during evaluation." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of whether training and test splits share structural similarities or non-independence, particularly for BeaverTails where training and test come from the same categories." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No concrete leakage detection or prevention methods are applied (no canary strings, membership inference, or decontamination pipelines)." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "ISARA consistently outperforms SFT on safety alignment across model sizes and harm categories", 373 "evidence": "Table 2 shows ISARA achieves lower harmful rates than SFT across all three harm categories for both LLaMA-7B and OPT-6.7B (e.g., discrimination: ISARA 1.2% vs SFT 9.2% on LLaMA-7B).", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "Iterative training surpasses one-time training when total generated samples are equalized", 378 "evidence": "Table 4: ISARA N=512 Iter 2 achieves 5.6% harmful rate vs N=1024 Iter 1 at 12.8% for LLaMA-7B, and 9.2% vs 12% for OPT-6.7B. Both produce the same number of total generated samples.", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "ISARA works on models as small as 350M parameters", 383 "evidence": "Table 3 shows OPT-350M improves from 29.5% (pretrained) to 22.1% (ISARA Iter 2), a 7.4 percentage-point reduction in harmful rate. 
However, the improvement is smaller than for larger models.", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "ISARA shows robust domain generalization across harm categories", 388 "evidence": "Figure 2 shows training in one BeaverTails category improves alignment in other categories as well for LLaMA-7B across iterations.", 389 "supported": "weak" 390 }, 391 { 392 "claim": "ISARA does not compromise utility while improving harmlessness", 393 "evidence": "Figure 3 shows ISARA wins 88% against pretrained and 50% against SFT on LLaMA-7B utility, and 91%/83% on OPT-6.7B, as measured by Beaver-7B-v1.0-Reward.", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "ISARA achieves a data scaling ratio exceeding 6x on average", 398 "evidence": "Table 5 reports scaling ratios of 5.8x-7.2x across categories for both LLaMA-7B and OPT-6.7B.", 399 "supported": "moderate" 400 }, 401 { 402 "claim": "ISARA improves truthfulness alignment over SFT", 403 "evidence": "Table 6: On LLaMA-7B, ISARA achieves +3.82 ROUGE-L difference vs SFT at -6.15. On OPT-6.7B, ISARA at -5.88 vs SFT at -10.77.", 404 "supported": "moderate" 405 }, 406 { 407 "claim": "ISARA outperforms SFT and ICL methods on instruction-following", 408 "evidence": "Figure 4: ISARA wins 70% against SFT and 58% against ICL-kNN on LLaMA-7B; 83% and 73% respectively on LLaMA-2-7B, using GPT-4 as judge.", 409 "supported": "moderate" 410 } 411 ], 412 "red_flags": [ 413 { 414 "flag": "No error bars or uncertainty quantification", 415 "detail": "All results across all tables and figures are point estimates from what appear to be single experimental runs. With no variance information, it is impossible to assess whether differences between methods are statistically meaningful or within normal run-to-run variation." 416 }, 417 { 418 "flag": "No limitations section", 419 "detail": "The paper contains no discussion of limitations, threats to validity, or scope boundaries. 
The conclusion is entirely positive with no acknowledgment of potential weaknesses." 420 }, 421 { 422 "flag": "Automated evaluation without validation", 423 "detail": "All evaluation relies on automated proxies (Beaver-Dam-7B classifier, ROUGE-L, GPT-4 judge) with no human evaluation and no discussion of whether these proxies faithfully measure the claimed alignment properties." 424 }, 425 { 426 "flag": "Company evaluating alignment method relevant to its products", 427 "detail": "The work was conducted at ByteDance Research, a company that develops and deploys LLMs commercially. An efficient alignment method directly benefits their products, yet this potential conflict is not acknowledged." 428 }, 429 { 430 "flag": "Domain generalization claim based on narrow testing", 431 "detail": "The 'domain generalization' claim (Figure 2) is tested only across BeaverTails harm subcategories, which are related safety domains. Cross-domain generalization to fundamentally different alignment domains (e.g., from safety to coding quality) is not tested." 432 }, 433 { 434 "flag": "Small evaluation sets", 435 "detail": "BeaverTails evaluation uses only 250 prompts per category. TruthfulQA has 817 total questions with 64 used for training. These sample sizes are small enough that single-digit percentage differences between methods may not be meaningful." 436 } 437 ], 438 "cited_papers": [ 439 { 440 "title": "Training language models to follow instructions with human feedback", 441 "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"], 442 "year": 2022, 443 "relevance": "Foundational RLHF paper establishing the mainstream alignment approach that ISARA aims to improve upon with reduced human involvement." 
444 }, 445 { 446 "title": "Constitutional AI: Harmlessness from AI feedback", 447 "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"], 448 "year": 2022, 449 "arxiv_id": "2212.08073", 450 "relevance": "Introduces RLAIF which substitutes human feedback with AI-generated feedback, closely related to ISARA's self-alignment approach." 451 }, 452 { 453 "title": "Self-Instruct: Aligning language model with self generated instructions", 454 "authors": ["Yizhong Wang", "Yeganeh Kordi", "Swaroop Mishra"], 455 "year": 2022, 456 "arxiv_id": "2212.10560", 457 "relevance": "Key prior work on self-alignment that ISARA extends by removing the need for human-crafted instructions." 458 }, 459 { 460 "title": "Principle-driven self-alignment of language models from scratch with minimal human supervision", 461 "authors": ["Zhiqing Sun", "Yikang Shen", "Qinhong Zhou"], 462 "year": 2023, 463 "arxiv_id": "2305.03047", 464 "relevance": "Self-Align method requiring manually designed principles; ISARA's primary comparison point for demonstrating reduced human involvement." 465 }, 466 { 467 "title": "Reinforced self-training (ReST) for language modeling", 468 "authors": ["Caglar Gulcehre", "Tom Le Paine", "Srivatsan Srinivasan"], 469 "year": 2023, 470 "arxiv_id": "2308.08998", 471 "relevance": "Iterative self-alignment using learned reward models; compared in Table 1 as the only other method achieving continuous enhancement." 472 }, 473 { 474 "title": "LIMA: Less is more for alignment", 475 "authors": ["Chunting Zhou", "Pengfei Liu", "Puxin Xu"], 476 "year": 2023, 477 "arxiv_id": "2305.11206", 478 "relevance": "Demonstrates alignment with only 1,000 SFT examples, supporting ISARA's premise that limited data can achieve substantial alignment." 
479 }, 480 { 481 "title": "RAIN: Your language models can align themselves without finetuning", 482 "authors": ["Yuhui Li", "Fangyun Wei", "Jinjing Zhao"], 483 "year": 2023, 484 "arxiv_id": "2309.07124", 485 "relevance": "Inference-time self-alignment approach showing alignment can be achieved without fine-tuning, alternative paradigm to ISARA." 486 }, 487 { 488 "title": "BeaverTails: Towards improved safety alignment of LLM via a human-preference dataset", 489 "authors": ["Jiaming Ji", "Mickel Liu", "Juntao Dai"], 490 "year": 2023, 491 "arxiv_id": "2307.04657", 492 "relevance": "Primary safety alignment benchmark used for ISARA evaluation, featuring annotated QA pairs across 14 harm categories." 493 }, 494 { 495 "title": "TruthfulQA: Measuring how models mimic human falsehoods", 496 "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"], 497 "year": 2021, 498 "arxiv_id": "2109.07958", 499 "relevance": "Truthfulness benchmark used to evaluate ISARA's alignment capability on factual accuracy." 500 }, 501 { 502 "title": "RLAIF: Scaling reinforcement learning from human feedback with AI feedback", 503 "authors": ["Harrison Lee", "Samrat Phatale", "Hassan Mansoor"], 504 "year": 2023, 505 "arxiv_id": "2309.00267", 506 "relevance": "Scales RLHF with AI feedback, relevant to reducing human involvement in alignment which is ISARA's core goal." 507 }, 508 { 509 "title": "Self-alignment with instruction backtranslation", 510 "authors": ["Xian Li", "Ping Yu", "Chunting Zhou"], 511 "year": 2023, 512 "arxiv_id": "2308.06259", 513 "relevance": "Humpback method using backtranslation for self-alignment, one of the comparison methods in Table 1." 
514 }, 515 { 516 "title": "Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned", 517 "authors": ["Deep Ganguli", "Liane Lovitt", "Jackson Kernion"], 518 "year": 2022, 519 "arxiv_id": "2209.07858", 520 "relevance": "Source of HH Red-Team dataset used in BeaverTails, relevant to safety evaluation methodology." 521 } 522 ], 523 "engagement_factors": { 524 "practical_relevance": { 525 "score": 2, 526 "justification": "Offers a practical method for aligning LLMs with fewer than 100 examples and no human instructions, useful for practitioners working with limited annotation budgets." 527 }, 528 "surprise_contrarian": { 529 "score": 1, 530 "justification": "The claim that models can self-align without human-crafted instructions is interesting but builds incrementally on existing self-alignment work rather than overturning beliefs." 531 }, 532 "fear_safety": { 533 "score": 1, 534 "justification": "Addresses safety alignment of LLMs but as a defense/improvement method rather than revealing new risks or attacks." 535 }, 536 "drama_conflict": { 537 "score": 0, 538 "justification": "No controversy, no challenge to major players or methodologies." 539 }, 540 "demo_ability": { 541 "score": 0, 542 "justification": "No code repository, demo, or tool is released." 543 }, 544 "brand_recognition": { 545 "score": 1, 546 "justification": "ByteDance Research is a known tech company but not in the top tier of AI lab brand recognition (OpenAI, Anthropic, DeepMind)." 547 } 548 } 549 }