scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29990B)
      1 {
      2   "paper": {
      3     "title": "Understanding Layer Significance in LLM Alignment",
      4     "authors": [
      5       "Guangyuan Shi",
      6       "Zexin Lu",
      7       "Xiaoyu Dong",
      8       "Wenlong Zhang",
      9       "Xuanyu Zhang",
     10       "Yujie Feng",
     11       "Xiao-Ming Wu"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv",
     15     "arxiv_id": "2410.17875",
     16     "doi": "10.48550/arXiv.2410.17875"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "The ILA method identifies important layers for LLM alignment by learning binary masks on parameter changes, revealing ~90% overlap in important layers across different alignment datasets for the same architecture. Freezing approximately 25% of unimportant layers improves performance, while fine-tuning only 10-30% of key layers achieves comparable results to full fine-tuning. Preliminary experiments on reasoning tasks (LIMO, s1.1) show similar patterns, with 86% layer overlap across reasoning datasets.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "All datasets used are publicly available: Alpaca-GPT4 (Peng et al., 2023), LIMA (Zhou et al., 2023), No Robots (Rajani et al., 2023), MMLU, Hellaswag, MT-Bench, Vicuna, LIMO, and s1.1."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No requirements.txt, Dockerfile, or environment specification is provided. Appendix B describes training hyperparameters but not the software environment, library versions, or hardware details."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions, README, or reproducibility scripts are provided."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All tables report point estimates only. Despite stating evaluations are 'performed three times, and the average scores are reported' (Appendix B), no confidence intervals, error bars, or ± notation appears in any table."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No statistical significance tests are used anywhere. All comparative claims (e.g., 'ILA consistently outperformed LoRA') are based solely on comparing raw numbers without any hypothesis tests."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Tables report both baseline and method performance numbers (e.g., LoRA: 44.58 MMLU vs LoRA w/ ILA: 45.78 in Table 5), providing sufficient context to assess effect magnitudes across all comparisons."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No justification is given for why 3 evaluation runs were chosen, nor why these particular dataset sizes were used. No power analysis is discussed."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Despite averaging over three runs for Vicuna and MT-Bench scores, no standard deviation, IQR, or any spread measure is reported in any table. The reader cannot assess result stability."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Multiple baselines are compared: LoRA, AdaLoRA, Full Fine-tuning. Additional baselines in ablation include random layer selection (RL1, RL2), first-K layers (FL), and last-K layers (LL) in Table 8."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Baselines include LoRA (2021), AdaLoRA (2023), and QLoRA (2023), which are standard and contemporary PEFT methods widely used at time of publication."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Extensive ablation study in Section 4.5 with 6 observations: comparison against random/position-based selection (Table 8), LoRA vs FFT layer identification (Table 9), cross-dataset evaluation (Table 12), cross-model transfer (Tables 13-14), initialization sensitivity (Table 15), and computation cost analysis."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Four metrics are used: MMLU (5-shot), Hellaswag (0-shot) for language understanding, and GPT-4o scored Vicuna and MT-Bench for conversational ability."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No human evaluation is included. Conversational ability is evaluated entirely by GPT-4o automated scoring on Vicuna and MT-Bench prompts."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Models are fine-tuned on alignment datasets (Alpaca-GPT4, LIMA, No Robots) and evaluated on separate benchmarks (MMLU, Hellaswag, MT-Bench, Vicuna) that are not used during training."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results are broken down across multiple models (LLAMA 2-7B, Mistral-7B, Llama 3.1-8B, LLAMA 2-13B), multiple datasets, and separated into language understanding vs conversational ability dimensions."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Observation 4 (Sec. 4.5) discusses where cross-model transfer is less effective (Jaccard drops to 0.70 across architectures). Table 6 shows performance drops when fine-tuning only 10% of layers. Table 8 shows where naive strategies underperform."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Several negative results: cross-architecture transfer is less effective than within-architecture (Table 13, J=0.70 vs 0.90); fine-tuning only 10% of layers shows measurable performance drops on conversational metrics (Table 6); QLoRA w/ ILA (30%) shows lower Vicuna scores than full QLoRA on LIMA (Table 7)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims are supported: ~90% layer overlap (Table 2 shows 0.89-0.93 Jaccard), freezing non-essential layers improves performance (Tables 5, 16-18), 10-30% key layers for efficiency (Tables 6-7), extension to reasoning described as preliminary with supporting data (Table 10, Fig 3)."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Causal claims like 'freezing unimportant layers improves performance' are supported by controlled single-variable manipulation: the ablation in Table 8 compares ILA against random and position-based selection while keeping all other variables constant. The selective fine-tuning experiments in Tables 5-7 also use controlled comparisons."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title claims 'Understanding Layer Significance in LLM Alignment' broadly, but experiments cover only 7B-13B scale models from two architectures (Llama, Mistral) plus one model for reasoning (Qwen). The extension to reasoning is based on a single model with two datasets. No mention of larger models, different pretraining paradigms, or non-English alignment."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No substantive discussion of alternative explanations. The paper does not consider whether the LoRA approximation introduces systematic biases in layer identification, whether GPT-4o scoring introduces confounds, or whether the consistent layer patterns might arise from artifacts of the optimization procedure rather than genuine alignment dynamics."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper uses GPT-4o scores on Vicuna and MT-Bench as proxies for 'conversational ability' and MMLU/Hellaswag for 'language understanding' without discussing the limitations of these proxies. GPT-4o as judge reliability is not addressed, and whether these benchmarks capture actual alignment quality is not discussed."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Primary models are specified with versions: LLAMA 2-7B, LLAMA 2-13B (Touvron et al., 2023), Mistral-7B-v0.1 (Jiang et al., 2023), Llama 3.1-8B (Dubey et al., 2024), Qwen2.5-7B-Instruct (Yang et al., 2024). GPT-4o used for evaluation is named but without a snapshot date."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The GPT-4o evaluation prompts for scoring Vicuna and MT-Bench responses are not provided. The paper references these benchmarks but does not reproduce the evaluation prompt text."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Extensive hyperparameter reporting in Appendix B: optimizer (AdamW with β1=0.9, β2=0.99, weight decay 0.1), cosine scheduler with warmup ratio 0.01, LoRA rank r=32, dropout rate 0.1, learning rates and training epochs for each model-dataset combination."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. The paper concerns fine-tuning methodology, not agentic systems."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No description of data preprocessing steps such as tokenization, formatting, or filtering of the alignment datasets before fine-tuning. The paper goes directly from dataset descriptions to experimental results."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No dedicated limitations section exists. Section 5 is titled 'Conclusion and Discussion: Beyond LLM Alignment' and discusses extension to reasoning, but does not address limitations of the work."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No threats to validity are discussed. There is no consideration of potential confounds, evaluation limitations, or methodological weaknesses."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show — e.g., applicability to larger models, non-English alignment, or production deployment settings."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "No raw experimental data (model outputs, layer importance scores, per-run results) is made available. Only aggregated results are shown in tables."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The datasets are well-described: Alpaca-GPT4 (52K instruction-following examples from GPT-4), LIMA (1K curated examples), No Robots (10K human-annotated examples). Evaluation benchmarks are standard and cited (Appendix B)."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants in this study. All data comes from standard public benchmarks and datasets."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "The pipeline from raw datasets to fine-tuning input is not documented. No description of tokenization, formatting, sequence length handling, or any preprocessing transformations applied to the alignment datasets."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or sponsors."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: Department of Computing, Hong Kong Polytechnic University, and Du Xiaoman Financial, China."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "Cannot assess funder independence since funding is not disclosed. One author is affiliated with Du Xiaoman Financial, but the relationship between this affiliation and the research outcomes is not discussed."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates are stated for any of the base models (LLAMA 2, Mistral, Llama 3.1, Qwen2.5). The models are evaluated on MMLU and Hellaswag which could overlap with pre-training data."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of potential overlap between the pre-training data of the base models and the evaluation benchmarks (MMLU, Hellaswag)."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "MMLU (2021) and Hellaswag (2019) are old benchmarks likely present in the training data of LLAMA 2 (2023), Mistral (2023), and later models. This contamination risk is not addressed."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Training time per iteration is reported: Stage 1 takes 6671 ms/iteration, Stage 2 takes 5343 ms/iteration and finishes in 11 minutes (Observation 6). GPU memory usage for all methods is detailed in Table 11."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "While per-iteration training time and GPU memory are reported (Table 11, Observation 6), the total computational budget (GPU type, total GPU hours, total training time for all experiments) is not stated."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Table 3 reports seed sensitivity for the layer importance algorithm (Jaccard 0.91-0.92 across seeds), but performance metrics (MMLU, Hellaswag, Vicuna, MT-Bench) are averaged over 3 runs without showing per-seed results or any spread measure."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Appendix B states: 'All evaluations are performed three times, and the average scores are reported.' Tables note 'averaged over three runs' for Vicuna and MT-Bench."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "Appendix B details the hyperparameter search grids: learning rates (5 values), training epochs (4-5 values) for both LoRA and full fine-tuning across each dataset and model combination."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "While search grids are provided, the paper does not explicitly state how the best configuration was selected (e.g., validation set performance) or which specific configuration was chosen for each experiment."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons. The paper makes numerous comparisons across models, datasets, and methods without any statistical testing."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors evaluate their own ILA method against baselines without acknowledging self-comparison bias. They use standard implementations for baselines (LoRA, AdaLoRA) but do not discuss whether their tuning of ILA hyperparameters may have received more attention."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "Table 11 compares GPU memory usage across methods (Full Fine-tune: 81276 MiB vs LoRA w/ ILA 30%: 28586 MiB). Tables 6-7 show performance at different layer percentages (10%, 20%, 30%, 75%). Performance and efficiency are explicitly related."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "No discussion of whether MMLU, Hellaswag, MT-Bench, or Vicuna actually measure what they claim to measure (language understanding, conversational ability). GPT-4o as judge reliability is not examined."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is involved. All comparisons are between fine-tuning strategies applied to the same models using the same training and evaluation pipelines."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "Not discussed. MMLU (2021) and Hellaswag (2019) predate the training of LLAMA 2, Mistral, and Llama 3.1, raising temporal leakage concerns that are not addressed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "Not discussed. No consideration of whether any evaluation information could leak through the experimental setup."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "Not discussed. No verification that the alignment training datasets and evaluation benchmarks are independent or non-overlapping."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, decontamination, or overlap analysis is performed."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Important layers identified by ILA exhibit nearly 90% overlap across different alignment datasets for the same architecture.",
    373       "evidence": "Table 2 shows Jaccard similarities of 0.89-0.93 for top 75% layers across LIMA, No Robots, and Alpaca-GPT4 for both LLAMA 2-7B and Mistral-7B. Figure 1 visualizes the overlapping layer patterns.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Freezing approximately 25% of unimportant layers improves overall model performance compared to fine-tuning all layers.",
    378       "evidence": "Table 5 shows ILA improving over LoRA and Full Fine-tune across all models (LLAMA 2-7B, Mistral-7B, Llama 3.1-8B) on No Robots. Consistent improvements shown in Tables 16-18 across LIMA and Alpaca-GPT4 datasets. No statistical significance tests performed.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Fine-tuning only 10-30% of the most important layers achieves competitive performance with full fine-tuning.",
    383       "evidence": "Table 6 shows Mistral-7B fine-tuned on 30% of layers (LoRA w/ ILA 30%) achieves MMLU 61.89 vs LoRA's 61.95 and higher MT-Bench (4.75 vs 4.68). Table 7 shows QLoRA w/ ILA (75%) improves over QLoRA on most metrics.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Layer importance rankings are stable across random seeds and training milestones.",
    388       "evidence": "Table 3 shows Jaccard similarity of 0.91-0.92 across different random seeds. Table 4 shows 0.90-0.93 similarity between 25%-100% training milestones (but only 0.69 at 1%).",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "A single layer importance ranking search suffices for different alignment tasks using the same architecture.",
    393       "evidence": "Rankings computed on No Robots were applied to LIMA and Alpaca-GPT4 datasets with consistent improvements (Tables 5, 17, 18). Cross-dataset intersection further improves results (Table 12).",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "ILA findings extend from alignment to reasoning tasks.",
    398       "evidence": "Figure 3 shows 86% Jaccard similarity in layer importance between LIMO and s1.1 datasets on Qwen2.5-7B-Instruct. Table 10 shows freezing 25% of layers improves MATH500 (77→79) and AIME (13.33→16.67). Based on a single model and two datasets only.",
    399       "supported": "weak"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "No error bars or uncertainty quantification",
    405       "detail": "Despite averaging results over 3 runs, no standard deviation, confidence intervals, or any spread measure is reported in any table. Performance differences between methods are often small (e.g., 0.12 MMLU points) and could easily be within noise. Without variance information, the reader cannot assess whether reported improvements are meaningful."
    406     },
    407     {
    408       "flag": "No statistical significance tests",
    409       "detail": "All comparative claims ('ILA consistently outperformed LoRA', 'freezing unimportant layers improves performance') are based solely on comparing raw numbers. Many reported improvements are small (e.g., 0.19-1.20 points on MMLU), and without significance tests, it is unclear whether any differences are statistically meaningful."
    410     },
    411     {
    412       "flag": "No limitations section",
    413       "detail": "The paper has no dedicated limitations section and does not discuss threats to validity, scope boundaries, or methodological weaknesses. The conclusion pivots to extending findings to reasoning without addressing limitations of the alignment experiments."
    414     },
    415     {
    416       "flag": "Overreach in reasoning claims",
    417       "detail": "The extension to LLM reasoning (Section 5) is based on a single model (Qwen2.5-7B-Instruct), two datasets (LIMO, s1.1), and two benchmarks (MATH500, AIME with only 30 problems). The abstract frames this as a general finding about reasoning but the evidence is preliminary at best."
    418     },
    419     {
    420       "flag": "GPT-4o as evaluator without version or validation",
    421       "detail": "GPT-4o is used to score conversational quality on Vicuna and MT-Bench without specifying the GPT-4o version/snapshot, without providing the evaluation prompts, and without any validation of GPT-4o's reliability as a judge for these comparisons."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Lima: Less is more for alignment",
    427       "authors": ["Chunting Zhou", "Pengfei Liu", "Puxin Xu", "Srini Iyer", "Jiao Sun", "Yuning Mao", "Xuezhe Ma"],
    428       "year": 2023,
    429       "relevance": "Key prior work showing alignment primarily refines style with small datasets, which this paper builds upon to investigate layer-level effects."
    430     },
    431     {
    432       "title": "The unlocking spell on base LLMs: Rethinking alignment via in-context learning",
    433       "authors": ["Bill Yuchen Lin", "Abhilasha Ravichander", "Ximing Lu"],
    434       "year": 2023,
    435       "arxiv_id": "2312.01552",
    436       "relevance": "URIAL finding that alignment mainly modifies stylistic tokens, directly motivating this paper's layer-level analysis."
    437     },
    438     {
    439       "title": "LoRA: Low-rank adaptation of large language models",
    440       "authors": ["Edward J. Hu", "Yelong Shen", "Phillip Wallis"],
    441       "year": 2021,
    442       "relevance": "Core PEFT method used throughout this paper for efficient layer importance identification and as a baseline."
    443     },
    444     {
    445       "title": "Direct preference optimization: Your language model is secretly a reward model",
    446       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    447       "year": 2024,
    448       "relevance": "Major alignment method (DPO) representing the preference learning approach to LLM alignment."
    449     },
    450     {
    451       "title": "QLoRA: Efficient finetuning of quantized LLMs",
    452       "authors": ["Tim Dettmers", "Artidoro Pagnoni", "Ari Holtzman", "Luke Zettlemoyer"],
    453       "year": 2023,
    454       "arxiv_id": "2305.14314",
    455       "relevance": "Efficient fine-tuning method that ILA integrates with to further reduce costs (Tables 7, 11)."
    456     },
    457     {
    458       "title": "AdaLoRA: Adaptive budget allocation for parameter-efficient fine-tuning",
    459       "authors": ["Qingru Zhang", "Minshuo Chen", "Alexander Bukharin"],
    460       "year": 2023,
    461       "arxiv_id": "2303.10512",
    462       "relevance": "Adaptive rank allocation baseline that addresses parameter importance from a different angle than ILA's layer-level approach."
    463     },
    464     {
    465       "title": "LISA: Layerwise importance sampling for memory-efficient large language model fine-tuning",
    466       "authors": ["Rui Pan", "Xiang Liu", "Shizhe Diao"],
    467       "year": 2024,
    468       "arxiv_id": "2403.17919",
    469       "relevance": "Related work on layerwise importance for fine-tuning efficiency, directly comparable to ILA's approach."
    470     },
    471     {
    472       "title": "The unreasonable ineffectiveness of the deeper layers",
    473       "authors": ["Andrey Gromov", "Kushal Tirumala", "Hassan Shapourian"],
    474       "year": 2024,
    475       "arxiv_id": "2403.17887",
    476       "relevance": "Layer importance analysis in LLMs showing deeper layers can be pruned, related to ILA's finding about non-essential layers."
    477     },
    478     {
    479       "title": "LIMO: Less is more for reasoning",
    480       "authors": ["Yixin Ye", "Zhen Huang", "Yang Xiao"],
    481       "year": 2025,
    482       "arxiv_id": "2502.03387",
    483       "relevance": "Reasoning dataset used in ILA's extension experiments, demonstrating that limited high-quality data enhances LLM reasoning."
    484     },
    485     {
    486       "title": "s1: Simple test-time scaling",
    487       "authors": ["Niklas Muennighoff", "Zitong Yang", "Weijia Shi"],
    488       "year": 2025,
    489       "arxiv_id": "2501.19393",
    490       "relevance": "Test-time scaling approach whose dataset (s1.1) is used in ILA's reasoning extension experiments."
    491     },
    492     {
    493       "title": "Task-specific skill localization in fine-tuned language models",
    494       "authors": ["Abhishek Panigrahi", "Nikunj Saunshi", "Haoyu Zhao", "Sanjeev Arora"],
    495       "year": 2023,
    496       "relevance": "Skill localization in fine-tuned LLMs, directly related to ILA's approach of identifying alignment-critical layers."
    497     },
    498     {
    499       "title": "Gradient-mask tuning elevates the upper limits of LLM performance",
    500       "authors": ["Haoling Li", "Xin Zhang", "Xiao Liu"],
    501       "year": 2024,
    502       "arxiv_id": "2406.15330",
    503       "relevance": "Related gradient-based masking approach for selective fine-tuning of LLMs."
    504     }
    505   ],
    506   "engagement_factors": {
    507     "practical_relevance": {
    508       "score": 2,
    509       "justification": "Practitioners fine-tuning LLMs could use ILA to reduce compute costs by freezing unimportant layers or selectively tuning critical ones."
    510     },
    511     "surprise_contrarian": {
    512       "score": 1,
    513       "justification": "The ~90% layer overlap across datasets is a specific novel finding, but the broader insight that alignment is shallow builds on prior work (LIMA, URIAL)."
    514     },
    515     "fear_safety": {
    516       "score": 0,
    517       "justification": "No safety or security concerns raised by this work."
    518     },
    519     "drama_conflict": {
    520       "score": 0,
    521       "justification": "No controversy or conflicting claims with other research."
    522     },
    523     "demo_ability": {
    524       "score": 0,
    525       "justification": "No code, demo, or tool is released."
    526     },
    527     "brand_recognition": {
    528       "score": 0,
    529       "justification": "From Hong Kong Polytechnic University and Du Xiaoman Financial, not a high-profile AI lab."
    530     }
    531   }
    532 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs