ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32782B)


      1 {
      2   "paper": {
      3     "title": "Layer-Aware Representation Filtering: Purifying Finetuning Data to Preserve LLM Safety Alignment",
      4     "authors": [
      5       "Hao Li",
      6       "Lijun Li",
      7       "Zhenghao Lu",
      8       "Xianyi Wei",
      9       "Rui Li",
     10       "Jing Shao",
     11       "Lei Sha"
     12     ],
     13     "year": 2025,
     14     "venue": "Conference on Empirical Methods in Natural Language Processing",
     15     "arxiv_id": "2507.18631",
     16     "doi": "10.48550/arXiv.2507.18631"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "LARF identifies safety-sensitive layers in LLMs by scaling layer parameters and measuring refusal rate changes, then uses bidirectional representation similarity at those layers to detect safety-degrading data in benign fine-tuning datasets. Fine-tuning Llama3.1 on the 1,000 highest-scored samples from Alpaca raises Attack Success Rate on HarmBench from 3.5% to 39%, while fine-tuning on the 1,000 lowest-scored samples reduces ASR to 0%. LARF is substantially more efficient than alternatives (0.5 hours on 1 GPU vs. 3–6 hours on 1–8 GPUs) and removing flagged data mitigates safety degradation across code generation, math reasoning, and medical QA without harming downstream task performance.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The abstract provides a GitHub link: 'Please see our code at https://github.com/LLLeoLi/LARF.'"
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "All evaluation datasets (HarmBench, HEx-PHI, DirectHarm4, Alpaca, Dolly, Magicoder, PubMedQA, MetaMath) are publicly available standard benchmarks. Reference datasets (Dsafe, Dunsafe) are constructed from the publicly available Circuit Breaker training dataset."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions NVIDIA A100-SXM 80GB GPUs (Table 3) and states model names, but provides no requirements.txt, Dockerfile, or detailed environment setup listing library versions."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are provided in the paper. A code link is given but the paper itself contains no README-style commands or 'Reproducing Results' section."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All tables (Tables 1, 2, 5, 6) report point estimates only. The Random baseline is averaged over 3 runs but no confidence intervals, error bars, or ± notation are provided."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Claims like 'LARF is the most effective method' are supported solely by comparing point estimates across methods. No statistical significance tests (t-tests, bootstrap, etc.) are used."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper reports effect sizes with baseline context throughout: e.g., 'raises the Attack Success Rate (ASR) on HarmBench from 3.5% to 39%, a 20% improvement over Bi-Anchoring' (Section 1). Tables provide absolute ASR values for all methods enabling direct comparison."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The choice of 1,000 samples for selection and 10,000 samples for downstream tasks is not justified. No power analysis or explanation of why these specific sample sizes were chosen."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The Random baseline is averaged over 3 runs but no standard deviation or variance is reported. LARF and other methods appear to be single deterministic runs (temperature=0), but no spread measure is reported for any method."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Four baselines are compared: Random sampling, SEAL (Shen et al., 2025), GradSafe (Xie et al., 2024), and Bi-Anchoring (He et al., 2024). The instruct model baseline is also included."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All baselines are from 2024-2025: SEAL (ICLR 2025), GradSafe (ACL 2024), Bi-Anchoring (COLM 2024). These represent the current state of the art in safety-degrading data detection."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Multiple ablations are provided: bidirectional vs. unidirectional representation scoring (Figure 3, Appendix D.1), layer-wise selection showing the safety-sensitive layer is optimal (Figure 5), and analysis of layer representation similarity (Appendix E.2)."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Safety is evaluated across three benchmarks (DirectHarm4, HarmBench, HEx-PHI) using ASR and GPT Score metrics. Downstream task utility is measured with HumanEval pass@1, PubMedQA accuracy, and MATH math_verify."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "All evaluation is fully automated: LlamaGuard 3 for safety classification (Section 4.1), GPT-4o for harmfulness scoring (Appendix C.4), and automated benchmarks for downstream tasks. No human evaluation of outputs is performed."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Safety evaluation benchmarks (HarmBench, HEx-PHI, DirectHarm4) are fully separate from fine-tuning datasets (Alpaca, Dolly). Downstream tasks use dedicated test splits (HumanEval, PubMedQA test set, MATH benchmark)."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 4.5 and Appendix E.5 provide per-category ASR breakdowns across safety categories (e.g., Malware, Drug, Phishing, Disinformation for DirectHarm4; 10 categories for HEx-PHI) shown in radar charts."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper discusses where LARF breaks down: Figure 17c shows Qwen2.5's bottom-ranked representations deviate from the baseline 'likely due to a distribution mismatch between the fine-tuning data and the original model.' The Limitations section acknowledges data-only filtering cannot fully prevent safety degradation."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 6 explicitly states 'data-only filtering cannot fully prevent safety degradation' and that effectiveness depends on reference data quality. The Qwen2.5 distribution mismatch issue (Figure 17c) is a negative finding. Categories like Physical Harm and Illegal Activities show no significant ASR change after fine-tuning."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract claims LARF 'efficiently and effectively' identifies safety-degrading data (supported by Tables 1, 3) and that removing such data 'mitigates safety alignment degradation' (supported by Tables 2, 5). The claim of 'broad generalizability' is supported by experiments across 6 models and 5 datasets, though this is somewhat stretched."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Causal claims (e.g., 'fine-tuning on safety-degrading data induces representational drifts') are supported by controlled ablation-style experiments: fine-tuning on top vs. bottom ranked samples from the same dataset, with the only variable being the selection method. This controlled single-variable manipulation is adequate for the causal claims made."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title claims 'Preserve LLM Safety Alignment' broadly, but experiments are limited to open-source models (no closed-source LLMs tested). The paper acknowledges in Limitations that VLMs and Diffusion Models are untested, but the title and abstract do not bound claims to the tested settings."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper hypothesizes that long point-by-point responses explain safety degradation (Section 4.5) but does not consider alternative explanations for LARF's effectiveness, such as whether simpler features (response length alone) could achieve similar filtering performance, or whether the representation similarity captures safety-relevant features vs. stylistic ones."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures ASR on specific safety benchmarks and refers to it precisely as 'Attack Success Rate' rather than inflating it to broader safety claims. GPT Score is separately reported as a harmfulness rating. The distinction between measured proxy (ASR, GPT Score) and claimed outcome (safety alignment) is maintained at appropriate granularity."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "The main models (Llama3-8B-Instruct, Llama3.1-8B-Instruct, Qwen2.5-7B-Instruct) are specified with size and variant. However, GPT-4o is used for evaluation scoring (Appendix C.4) without a snapshot date or API version, which per the schema is insufficient — 'Marketing names like GPT-4o without a snapshot date do NOT count.'"
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The GPT-4o evaluation prompt is described as 'a revised version of the one used by (Qi et al., 2024)' but not provided. The overrejection dataset generation prompt is not shown. Safety-related refusal patterns ('I cannot', 'Sorry', etc.) are briefly mentioned but the full set is not listed."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Fine-tuning hyperparameters are detailed in Appendix C.5: LoRA rank=8, α=8, learning rate 1e-4, warmup ratio 0.1, batch size 8, 3 epochs, cosine LR scheduler. Generation parameters: temperature=0, do_sample=False. Scaling factors α∈{0.1, 0.2} for layer identification."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. LARF is a data filtering method that extracts representations from model layers, not an agent-based system."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Appendix C.1 describes overrejection dataset construction (110 instructions generated and filtered). Appendix C.2 details Dsafe/Dunsafe construction: 5 examples from each of 20 categories from Circuit Breaker. Section 4.4 states '10,000 data points' sampled and 'top 2,000 ranked samples' removed."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 6 'Limitations' is a dedicated section spanning multiple paragraphs discussing three specific limitations of the approach."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 6 raises specific threats: (1) data-only filtering cannot fully prevent safety degradation and should be combined with safety-aware fine-tuning, (2) effectiveness depends on reference dataset quality and composition, (3) the method has not been tested on VLMs or Diffusion Models where similar problems exist."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 6 explicitly states what was NOT tested: 'Our experiments have been limited to LLMs, and we have not yet evaluated our approach on vision–language models (VLMs) or Diffusion Models.' It also states that 'exploring optimal reference selection lies beyond the scope of this work.'"
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "The constructed reference datasets (Dsafe, Dunsafe, overrejection dataset Ds) are not explicitly stated as being released. While the code repository is linked, the paper does not confirm these specific datasets are included. Evaluation benchmark results are not provided as raw data."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Appendix C.1 describes overrejection dataset generation using Llama-3.1-8B-Lexi-Uncensored-V2 with filtering criteria. Appendix C.2 describes Dsafe/Dunsafe selection from 20 Circuit Breaker categories with 5 examples each. Evaluation datasets are described with their sources."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. All data comes from standard benchmarks or model-generated datasets."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The full pipeline is documented: reference dataset construction (Appendix C.1–C.2), safety-sensitive layer identification (Section 3.2), representation extraction and scoring (Section 3.3), data selection by rank, fine-tuning (Appendix C.5), and evaluation (Appendix C.3–C.4). Algorithm 1 summarizes the complete process."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding or acknowledgments section is present in the paper text. The authors are affiliated with Shanghai AI Lab, Beihang University, Wuhan University, and Peking University, but no funding source is disclosed."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: Shanghai Artificial Intelligence Laboratory, Beihang University, Wuhan University, and Peking University. No evaluated product is tied to these affiliations."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "Since funding is not disclosed, independence of the funder from the outcome cannot be assessed."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "This paper tests a defense method (data filtering for safe fine-tuning) rather than evaluating pre-trained model knowledge on benchmarks. The safety benchmarks evaluate fine-tuned model behavior, not pre-trained capability."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "The paper tests a defense method (data filtering) rather than pre-trained model capability on benchmarks."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "The paper tests a defense method (data filtering) rather than pre-trained model capability on benchmarks."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Table 3 reports wall-clock runtime for each method on the Alpaca dataset: LARF requires 0.5 hours, SEAL 6 hours, GradSafe 5.3 hours, Bi-Anchoring 3 hours."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": true,
    297         "justification": "Table 3 reports per-GPU memory usage and number of GPUs: LARF uses 1×18.4GB, SEAL 8×36GB, GradSafe 1×48GB, Bi-Anchoring 4×27.8GB, all on NVIDIA A100-SXM 80GB GPUs."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Only the Random baseline is repeated (3 runs). LARF and all other baselines appear as single deterministic runs (temperature=0 for generation). No seed sensitivity analysis is performed."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Random baseline explicitly states 3 runs. For LARF and other methods, the number of runs is not explicitly stated. Setting temperature=0 implies deterministic single runs but this is not stated directly."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search budget is reported. LoRA settings (rank=8, α=8, lr=1e-4) appear fixed without explanation. Only the scaling factor α∈{0.1, 0.2} is explored with justification for not going higher."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The scaling factor range is justified (excessive perturbation causes confusion), but LoRA hyperparameters and the choice of 1,000 samples for selection are not justified through any systematic selection process."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No formal statistical tests are performed at all, so multiple comparison correction is not applicable."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors implement all baselines (SEAL, GradSafe, Bi-Anchoring) themselves for comparison without acknowledging author-implementation bias. For Bi-Anchoring, they note using the same Dsafe/Dunsafe for fairness but do not discuss broader self-comparison bias."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "Table 3 directly compares compute budgets (time, memory, GPUs) alongside effectiveness results in Table 1, allowing readers to assess the performance-compute tradeoff. LARF achieves the best performance with the lowest compute."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper does not discuss whether ASR on DirectHarm4, HarmBench, and HEx-PHI actually measures 'safety alignment' or whether these benchmarks capture the full scope of safety concerns. LlamaGuard 3 and GPT-4o as automated evaluators are used without validating their accuracy for this task."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is involved in this work. LARF is a data filtering method, not an agent-based system."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether models may have seen safety benchmark prompts (HarmBench, HEx-PHI, DirectHarm4) during pre-training, which could affect refusal behavior patterns."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the evaluation setup (e.g., prompt formatting, instruction templates) could leak information about expected behavior."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of potential overlap between fine-tuning datasets (Alpaca, Dolly) and safety evaluation benchmarks (HarmBench, HEx-PHI, DirectHarm4), or whether the Dsafe/Dunsafe reference data shares content with evaluation data."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection or prevention method is applied. No overlap analysis, temporal splits, or decontamination checks are performed."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "LARF is the most efficient method for data filtering, requiring only 0.5 hours on 1 GPU with 18.4GB memory for the Alpaca dataset on Llama3.1.",
    373       "evidence": "Table 3 compares wall-clock runtime and GPU requirements: LARF (0.5h, 1×18.4GB) vs SEAL (6h, 8×36GB), GradSafe (5.3h, 1×48GB), Bi-Anchoring (3h, 4×27.8GB).",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Fine-tuning Llama3.1 on the 1,000 highest LARF-scored samples from Alpaca raises ASR on HarmBench from 3.5% to 39%, a 20% improvement over Bi-Anchoring.",
    378       "evidence": "Table 1 shows Llama3.1 Alpaca HarmBench ASR: Instruct 3.50%, LARF 39.00%, Bi-Anchoring 12.50%. The abstract claims a '20% improvement', but the tabulated gap is 39.00% - 12.50% = 26.5 percentage points, so the basis of the 20% figure is ambiguous.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "LARF is the most effective method for selecting safety-degrading data across all models and datasets.",
    383       "evidence": "Table 1 shows LARF achieving the highest ASR for all three models on both Alpaca and Dolly datasets. Table 5 shows LARF achieving the lowest ASR for safe data (bottom-ranked samples).",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Removing LARF-identified safety-degrading data mitigates safety alignment degradation without sacrificing downstream performance.",
    388       "evidence": "Table 2 shows all methods maintain task performance within 1% of random baseline on HumanEval, PubMedQA, and MATH, while LARF consistently achieves the lowest GPT Score and ASR on DirectHarm4 for every model-benchmark pair.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Safety-degrading examples are characterized by long point-by-point responses that exceed dataset averages.",
    393       "evidence": "Table 4 shows top-ranked samples have 516-872 point-style responses (vs. 276 average) and 333-354 output tokens (vs. 138 average) on Alpaca. Tables 7-9 show this pattern across all five datasets.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Fine-tuning on safety-degrading data induces representational drift in the safety-sensitive layer.",
    398       "evidence": "Figure 17 shows PCA projections where top-1,000 fine-tuned models diverge from the instruct baseline while bottom-1,000 remain clustered. Appendix E.4 shows increasing effective rank for top-ranked fine-tuned models.",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "The 13th layer is the safety-sensitive layer for Llama3 and Llama3.1, and the 18th layer for Qwen2.5.",
    403       "evidence": "Figures 4, 7, 8, 10 show layer-wise sensitivity analysis. Figure 5 validates that selecting data by 13th-layer representations yields the highest ASR for Llama3 across all safety benchmarks.",
    404       "supported": "strong"
    405     }
    406   ],
    407   "red_flags": [
    408     {
    409       "flag": "No error bars or variance on main results",
    410       "detail": "Tables 1, 2, and 5 report point estimates only. The Random baseline is averaged over 3 runs but without standard deviation. LARF and other methods appear to be single runs. With temperature=0 the model output is deterministic, but the fine-tuning process has stochasticity from LoRA initialization and data ordering that is not quantified."
    411     },
    412     {
    413       "flag": "No statistical significance tests",
    414       "detail": "All comparative claims ('LARF is the most effective') rely on comparing raw percentages across methods without any statistical tests. Some gaps are large (e.g., LARF 49.50% vs Bi-Anchoring 11.00% on Llama3.1/Alpaca/DirectHarm4), but others are much narrower (e.g., LARF 31.00% vs Bi-Anchoring 24.50% on Qwen2.5/Alpaca/HarmBench), so significance testing would strengthen the claims."
    415     },
    416     {
    417       "flag": "Automated safety evaluation without validation",
    418       "detail": "All safety judgments rely on LlamaGuard 3 and GPT-4o without validating their accuracy for this specific task. LlamaGuard 3 is an 8B model fine-tuned on content safety, and its alignment with human judgment on safety-degrading data detection is not assessed. GPT-4o harmfulness scores are used without inter-rater agreement checks."
    419     },
    420     {
    421       "flag": "Missing funding disclosure",
    422       "detail": "No funding or acknowledgments section is present despite all authors being affiliated with research institutions (Shanghai AI Lab, Beihang/Wuhan/Peking Universities) that typically require grant disclosures."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to!",
    428       "authors": ["Xiangyu Qi", "Yi Zeng", "Tinghao Xie", "Pin-Yu Chen", "Ruoxi Jia", "Prateek Mittal", "Peter Henderson"],
    429       "year": 2024,
    430       "relevance": "Foundational work showing that fine-tuning on benign data can degrade LLM safety alignment, directly motivating LARF's data filtering approach."
    431     },
    432     {
    433       "title": "What is in your safe data? Identifying benign data that breaks safety",
    434       "authors": ["Luxi He", "Mengzhou Xia", "Peter Henderson"],
    435       "year": 2024,
    436       "relevance": "Proposes Bi-Anchoring method for identifying safety-degrading data using gradient similarity, serving as a direct baseline and motivation for LARF."
    437     },
    438     {
    439       "title": "SEAL: Safety-enhanced aligned LLM fine-tuning via bilevel data selection",
    440       "authors": ["Han Shen", "Pin-Yu Chen", "Payel Das", "Tianyi Chen"],
    441       "year": 2025,
    442       "relevance": "Trains a dedicated data ranker for safe fine-tuning via bilevel optimization, a key baseline for LARF comparison."
    443     },
    444     {
    445       "title": "Refusal in language models is mediated by a single direction",
    446       "authors": ["Andy Arditi", "Oscar Balcells Obeso", "Aaquib Syed", "Daniel Paleka", "Nina Rimsky", "Wes Gurnee", "Neel Nanda"],
    447       "year": 2024,
    448       "relevance": "Demonstrates that refusal behavior in LLMs is mediated by a single representational direction, foundational to LARF's bidirectional representation approach."
    449     },
    450     {
    451       "title": "Improving alignment and robustness with circuit breakers",
    452       "authors": ["Andy Zou", "Long Phan", "Sarah Chen", "James Campbell"],
    453       "year": 2024,
    454       "arxiv_id": "2406.04313",
    455       "relevance": "Defends against adversarial attacks by rerouting harmful representations, demonstrating representation-based safety approaches and providing training data used in LARF's reference dataset construction."
    456     },
    457     {
    458       "title": "Safety layers in aligned large language models: The key to LLM security",
    459       "authors": ["Shen Li", "Liuyi Yao", "Lan Zhang", "Yaliang Li"],
    460       "year": 2025,
    461       "relevance": "Identifies safety-critical layers in LLMs and preserves alignment through gradient freezing, directly inspiring LARF's safety-sensitive layer identification method."
    462     },
    463     {
    464       "title": "GradSafe: Detecting jailbreak prompts for LLMs via safety-critical gradient analysis",
    465       "authors": ["Yueqi Xie", "Minghong Fang", "Renjie Pi", "Neil Gong"],
    466       "year": 2024,
    467       "relevance": "Classifies unsafe instructions based on gradients of safety-sensitive parameters, a baseline method for LARF and a related approach to safety-critical parameter identification."
    468     },
    469     {
    470       "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal",
    471       "authors": ["Mantas Mazeika", "Long Phan", "Xuwang Yin", "Andy Zou"],
    472       "year": 2024,
    473       "arxiv_id": "2402.04249",
    474       "relevance": "Provides the primary safety evaluation benchmark used throughout LARF's experiments to measure Attack Success Rate."
    475     },
    476     {
    477       "title": "Representation engineering: A top-down approach to AI transparency",
    478       "authors": ["Andy Zou", "Long Phan", "Sarah Chen"],
    479       "year": 2023,
    480       "relevance": "Foundational work showing that intermediate representations contain rich safety-relevant information, motivating LARF's representation-based data filtering approach."
    481     },
    482     {
    483       "title": "Safe LoRA: The silver lining of reducing safety risks when fine-tuning large language models",
    484       "authors": ["Chia-Yi Hsu", "Yu-Lin Tsai", "Chih-Hsun Lin", "Pin-Yu Chen", "Chia-Mu Yu", "Chun-Ying Huang"],
    485       "year": 2024,
    486       "relevance": "Addresses safety-preserving fine-tuning through parameter merging, representing the parameter-restoration paradigm that complements LARF's data filtering approach."
    487     },
    488     {
    489       "title": "Harmful fine-tuning attacks and defenses for large language models: A survey",
    490       "authors": ["Tiansheng Huang", "Sihao Hu", "Fatih Ilhan", "Selim Furkan Tekin", "Ling Liu"],
    491       "year": 2024,
    492       "relevance": "Comprehensive survey of harmful fine-tuning attacks and defenses, providing the broader research context for LARF's contribution."
    493     },
    494     {
    495       "title": "Keeping LLMs aligned after fine-tuning: The crucial role of prompt templates",
    496       "authors": ["Kaifeng Lyu", "Haoyu Zhao", "Xinran Gu", "Dingli Yu", "Anirudh Goyal", "Sanjeev Arora"],
    497       "year": 2024,
    498       "relevance": "Demonstrates how prompt templates affect safety alignment post-fine-tuning, providing the DirectHarm4 benchmark used in LARF's evaluation."
    499     }
    500   ],
    501   "engagement_factors": {
    502     "practical_relevance": {
    503       "score": 2,
    504       "justification": "LARF provides a usable data filtering tool for practitioners fine-tuning LLMs, with code released and demonstrated efficiency (0.5h on 1 GPU), but requires some ML expertise to deploy."
    505     },
    506     "surprise_contrarian": {
    507       "score": 1,
    508       "justification": "The finding that benign data degrades safety is already established (Qi et al., 2024); LARF's layer-aware approach is novel but incremental rather than paradigm-shifting."
    509     },
    510     "fear_safety": {
    511       "score": 2,
    512       "justification": "Raises legitimate concerns that benign-looking fine-tuning data can silently undermine LLM safety alignment, a concern relevant to any organization fine-tuning models."
    513     },
    514     "drama_conflict": {
    515       "score": 0,
    516       "justification": "No controversy, no criticism of specific companies or products, purely technical contribution."
    517     },
    518     "demo_ability": {
    519       "score": 1,
    520       "justification": "Code released on GitHub but requires GPU access, model downloads, and ML pipeline setup — not immediately runnable."
    521     },
    522     "brand_recognition": {
    523       "score": 1,
    524       "justification": "Shanghai AI Lab is notable in Chinese AI research circles and the paper uses well-known models (Llama, Qwen), but neither the lab nor the models are household names."
    525     }
    526   }
    527 }

Impressum · Datenschutz