scan.json (25435B)
1 { 2 "paper": { 3 "title": "On the Limits of Layer Pruning for Generative Reasoning in LLMs", 4 "authors": ["Safal Shrestha", "Anubhav Shrestha", "Aadim Nepal", "Minwu Kim", "Keith Ross"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2602.01997" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "Layer pruning preserves classification performance (80%+ retention at 25% pruning) but severely degrades generative reasoning tasks. The degradation stems from disruption of core algorithmic capabilities (arithmetic, parenthesis tracking), not just surface-level text degeneration. Supervised finetuning with Self-Generated Responses (SGR) improves recovery by 20-30pp over standard finetuning but cannot fully restore generative reasoning, especially at aggressive pruning ratios. Moderate pruning (≤10-15%) with SGR offers the best practical trade-off.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "GitHub repository provided: https://github.com/safal312/on-the-limits-of-layer-pruning (Section 1, footnote 1)." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "Data and models available on HuggingFace: https://huggingface.co/collections/safal312/on-the-limits-of-generative-reasoning-in-llms (Section 1, footnote 2). All benchmarks used are public." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper mentions 'single A100 80GB GPU' and QLoRA settings but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are provided in the paper. 
A GitHub link exists but the paper itself does not describe how to reproduce experiments." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results are reported as point estimates (retention ratios). No confidence intervals or error bars are shown in any table or figure." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper makes numerous comparative claims (SGR outperforms standard finetuning, classification retains more than generative) but no statistical significance tests are reported." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Effect sizes are reported as percentage point differences with baseline context throughout, e.g., 'gain of +25.9 percentage points relative to finetuning on Alpaca alone' (Section 5.1), and retention ratios normalize against baseline performance." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "No justification is given for the number of benchmarks, models, or the 200 arithmetic problems used in the ablation study." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be single-run numbers." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Multiple baselines included: unpruned model, standard SFT with open-source data (Alpaca, Dolci), and results from Lu et al. (2024) are compared against the proposed SGR approach." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Baselines include recent pruning strategies (BI, Reverse Order from Men et al. 2025, Lu et al. 
2024) and recent model families (Llama 3.1, Qwen 2.5, Gemma 2)." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "Extensive ablation: single-layer removal analysis (Section 4), comparison across pruning strategies (BI, Reverse, Iterative), training data sources (Alpaca vs Dolci, standard vs SGR), and pruning ratios (Section 5.3)." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Multiple metrics across classification (HellaSwag, PIQA, MMLU, WinoGrande, OBQA, ARC-E, ARC-C) and generative (GSM8K, HumanEval+, MBPP+, XSUM) benchmarks, plus text degeneration metrics (4-gram repetition, Self-BLEU4)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": false, 87 "justification": "No human evaluation is included. All evaluation is automated via benchmarks and metrics. Manual inspection is mentioned briefly for qualitative examples but not as systematic evaluation." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "Standard benchmark test sets are used (GSM8K test set mentioned explicitly in A.6.2, HumanEval+/MBPP+ are standard held-out sets). Finetuning uses separate training data (Alpaca, Dolci)." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Per-benchmark breakdowns are provided in all tables (Tables 1-6), showing individual results for each classification and generative benchmark rather than only aggregate scores." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Detailed failure analysis in Sections 4.1-4.3: text degeneration, arithmetic errors (Appendix A.3 example), parenthesis tracking failures (Appendix A.5 example), and syntactic error distributions (Figure 4)." 
103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "A central finding is negative: SGR finetuning cannot fully recover generative reasoning. Section A.6.2 reports an upper-bound experiment that still fails. Iterative pruning does not consistently outperform simpler strategies." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Abstract claims about classification vs generative gap are supported by Tables 1-3; 'up to 90% baseline performance' on classification is shown in Table 2; '20-30 percentage points' gains on generative are shown across multiple ablations." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "Causal claims ('pruning disrupts arithmetic capability') are supported by controlled single-layer ablation experiments isolating the effect of individual layer removal (Figure 3). The controlled evaluation with logprob measurement (Section 4.2) strengthens causal inference." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper tests across 4 model families and explicitly notes model-specific differences (e.g., 'Qwen exhibits less layer redundancy'). The title appropriately focuses on 'layer pruning' not all compression. Discussion notes limitations 'under realistic post-training constraints.'" 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "Section 4.1 explicitly rules out text degeneration as the sole explanation for generative failure, showing 'degeneration alone does not fully account for failures.' Section 7 discusses alternative theories about shallow vs deep subnetworks (Petty et al., Telgarsky)." 
130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper carefully distinguishes between proxy metrics (benchmark accuracy, retention ratios) and the underlying capabilities (arithmetic competence, syntactic ability, reasoning). Section 4.2 explicitly creates a controlled evaluation to isolate arithmetic from generation demands." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "Specific model versions are named: Qwen-2.5-7B-Instruct, Llama-3.1-8B-Instruct, Mistral-7B-Instruct-v0.3, Gemma2-2B-It. These are sufficiently specific versioned model names." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "The arithmetic evaluation prompt format is provided: 'Question: What is (7 + 5) - 6? Answer:' (Section 4.2). The paper primarily uses standard benchmark evaluation harnesses (lm-evaluation-harness) with standard prompting." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "QLoRA hyperparameters reported in A.6.1: '4-bit NF4 quantization, learning rate 2×10⁻⁴, batch size 8, constant learning rate with 50 warmup steps, bf16 training, sequence length 8192.' Training epochs specified per dataset." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is used. This is a model compression study using standard evaluation frameworks." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "SGR data generation process is described: 'we pass only the prompts from the open-source datasets to the base model and use its generated outputs' (Section 5). Alpaca-cleaned and Dolci datasets identified. For upper-bound experiment, 'eight responses from the unpruned Qwen model' per example (A.6.2)." 
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 7 (Discussion & Conclusion) contains substantive discussion of limitations, including the persistent gap in recovery and the constraints of post-training settings." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "Specific limitations discussed: recovery may only work 'under constrained settings studied here' (Section 7), QLoRA vs full finetuning comparison (A.6.3), model-specific behavior (Qwen vs Llama redundancy differences), and acknowledged that 'aggressive pruning may be impractical.'" 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "The paper explicitly states scope: 'Rather than proposing a new pruning algorithm, our goal is to characterize the limits' (Section 1). It bounds results to 'realistic post-training constraints, without access to pretraining-scale data or compute.'" 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": true, 185 "justification": "Code and models are released on GitHub and HuggingFace, which should include the experimental outputs. All benchmarks used are publicly available." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Data sources are clearly described: standard public benchmarks (GSM8K, HumanEval+, MBPP+, XSUM, classification benchmarks), plus Alpaca-cleaned and Dolci datasets for finetuning. Arithmetic evaluation uses EleutherAI/arithmetic dataset (A.4)." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. Data sources are standard public benchmarks." 
196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline is documented: pruning strategy → optional finetuning → evaluation on benchmarks. SGR data generation process is described. The iterative pruning algorithm is formally specified (Algorithm 1)." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "Acknowledgements section: 'supported in part by the NYU Abu Dhabi Center for Artificial Intelligence and Robotics, funded by Tamkeen under the Research Institute Award CG010.'" 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "All authors listed as Department of Computer Science, New York University Abu Dhabi. No conflict with evaluated products (they evaluate open-source models from Meta, Google, Alibaba, Mistral)." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": true, 217 "justification": "Funder (Tamkeen/NYU Abu Dhabi) is an academic research institute with no financial stake in the outcome of layer pruning evaluations." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is included in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "No training data cutoff dates are stated for any of the four models evaluated, despite using benchmarks like GSM8K and HumanEval that could be in training data." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether benchmark problems (GSM8K, HumanEval+) appeared in the training data of the evaluated models." 
235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "GSM8K (2021) and HumanEval (2021) are well known to be in many models' training data. All models tested were released in 2024-2025, well after these benchmarks. The paper includes no contamination discussion." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "No inference cost or latency numbers are reported despite evaluating model compression methods where efficiency is a core motivation." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "Hardware is mentioned ('single A100 80GB GPU') but total GPU hours, training time, or computational budget are not stated." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No multiple seed experiments reported. 
All results appear to be single-run numbers." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The number of experimental runs is never explicitly stated." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "No hyperparameter search budget is reported. The QLoRA settings appear fixed but no justification for these choices is given." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": false, 310 "justification": "Multiple configurations are compared (different pruning strategies, datasets) but the selection of reported configurations is not justified beyond standard choices from prior work." 311 }, 312 "multiple_comparison_correction": { 313 "applies": false, 314 "answer": false, 315 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The paper proposes SGR and compares against baselines without acknowledging self-comparison bias. However, many baselines are sourced from Lu et al. (2024), partially mitigating this." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": false, 325 "justification": "No performance-vs-compute analysis despite SGR requiring generating responses from the full unpruned model, which is an additional compute cost not compared against distillation approaches." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": true, 330 "justification": "The paper explicitly discusses construct validity: Section 4.2 argues that standard generative benchmarks 'can obscure true abilities due to auxiliary demands' and designs controlled evaluations to isolate arithmetic competence. Section 4 discusses what benchmarks actually measure." 
331 }, 332 "scaffold_confound_addressed": { 333 "applies": false, 334 "answer": false, 335 "justification": "No scaffolding is involved. Models are evaluated directly on benchmarks." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "Not discussed. GSM8K and HumanEval were published years before the models' training, creating clear temporal leakage risk." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "Not discussed. HumanEval+ provides function signatures as prefixes which could interact with memorized solutions." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "Not discussed." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No leakage detection or prevention methods are applied." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "Generative reasoning tasks are substantially more sensitive to layer pruning than classification benchmarks, with even single-layer removal causing severe degradation.", 364 "evidence": "Table 1 shows classification retention of 65-84% vs generative retention of 14-33% at 25% pruning across 4 models. Figure 1 shows sharp drops from single-layer removal on GSM8K and HumanEval+.", 365 "supported": "strong" 366 }, 367 { 368 "claim": "Layer pruning disrupts core algorithmic capabilities (arithmetic computation, parenthesis tracking), not just surface-level text generation.", 369 "evidence": "Section 4.2: controlled arithmetic evaluation shows accuracy dropping to 34.3% after pruning (Figure 7). Figure 3 shows logprob degradation. Figure 4 shows syntactic error spikes. 
Section 4.1 demonstrates degeneration alone doesn't explain failures.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "SGR finetuning achieves gains of up to 20-30 percentage points on generative benchmarks compared to standard finetuning.", 374 "evidence": "Table 2: Llama BI + S.Dolci achieves 63.4% generative retention vs 32.2% for BI + Dolci (+31.2pp). Similar gains observed across models (Table 5).", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Recovery for generative reasoning remains fundamentally limited even after SGR finetuning.", 379 "evidence": "Figure 7: arithmetic accuracy recovers to only 58% from 34.3% (base: 77.3%). Section A.6.2 upper-bound experiment with task-aligned full SFT still cannot recover baseline. Persistent gap between classification (90%) and generative (63%) retention.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Moderate pruning ratios (≤10-15%) with SGR offer a practical trade-off for preserving generative reasoning.", 384 "evidence": "Figure 6 and Table 6: at 2 layers removed, ~85-90% retention; at ~10%, ~80% for BI/Iterative. Beyond 15%, steady decline.", 385 "supported": "moderate" 386 } 387 ], 388 "red_flags": [ 389 { 390 "flag": "No variance or uncertainty reporting", 391 "detail": "All results are single-run point estimates with no error bars, standard deviations, or confidence intervals. Given that pruning and finetuning outcomes can be sensitive to random seeds, the stability of results is unknown." 392 }, 393 { 394 "flag": "No contamination analysis", 395 "detail": "GSM8K and HumanEval were published in 2021. All evaluated models (Llama 3.1, Qwen 2.5, etc.) were trained on data collected well after 2021, which likely includes these benchmarks. Performance differences between pruned and unpruned models could partly reflect memorization rather than reasoning capability." 
396 }, 397 { 398 "flag": "Missing compute cost comparison", 399 "detail": "SGR requires running inference with the full unpruned model to generate training targets, but this cost is never quantified or compared against the distillation approaches the paper argues are too expensive." 400 } 401 ], 402 "cited_papers": [ 403 { 404 "title": "The unreasonable ineffectiveness of the deeper layers", 405 "authors": ["Andrei Gromov", "Kushal Tirumala", "Hassan Shapourian", "Paolo Glorioso", "Daniel A. Roberts"], 406 "year": 2024, 407 "arxiv_id": "2403.17887", 408 "relevance": "Key prior work on layer redundancy in LLMs, finding that deep layers can be removed with minimal classification impact." 409 }, 410 { 411 "title": "Shortened LLaMA: Depth Pruning for Large Language Models with Comparison of Retraining Methods", 412 "authors": ["Bo-Kyeong Kim"], 413 "year": 2024, 414 "arxiv_id": "2402.02834", 415 "relevance": "Comparative study of retraining methods after depth pruning, directly relevant to compression and efficiency." 416 }, 417 { 418 "title": "ShortGPT: Layers in Large Language Models Are More Redundant Than You Expect", 419 "authors": ["Xin Men"], 420 "year": 2025, 421 "relevance": "Demonstrates layer redundancy using Block Influence metric, one of the main pruning strategies evaluated." 422 }, 423 { 424 "title": "Compact Language Models via Pruning and Knowledge Distillation", 425 "authors": ["Saurav Muralidharan"], 426 "year": 2024, 427 "relevance": "Large-scale pruning and distillation approach (Minitron), represents the compute-heavy alternative to SGR." 428 }, 429 { 430 "title": "LLM Pruning and Distillation in Practice: The Minitron Approach", 431 "authors": ["Sharath Turuvekere Sreenivas"], 432 "year": 2024, 433 "arxiv_id": "2408.11796", 434 "relevance": "Practical pruning and distillation pipeline for LLMs, baseline for recovery methods." 
435 }, 436 { 437 "title": "Reassessing Layer Pruning in LLMs: New Insights and Methods", 438 "authors": ["Yifei Lu"], 439 "year": 2024, 440 "arxiv_id": "2411.15558", 441 "relevance": "Source of several baseline results used in this paper; reassesses layer pruning methods." 442 }, 443 { 444 "title": "When Fewer Layers Break More Chains: Layer Pruning Harms Test-Time Scaling in LLMs", 445 "authors": ["Kaiqi Wang"], 446 "year": 2025, 447 "arxiv_id": "2510.22228", 448 "relevance": "Studies how layer pruning affects chain-of-thought reasoning and test-time scaling." 449 }, 450 { 451 "title": "Training compute-optimal large language models", 452 "authors": ["Jordan Hoffmann"], 453 "year": 2022, 454 "arxiv_id": "2203.15556", 455 "relevance": "Chinchilla scaling laws, foundational work on compute-optimal training relevant to model efficiency." 456 }, 457 { 458 "title": "SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot", 459 "authors": ["Elias Frantar", "Dan Alistarh"], 460 "year": 2023, 461 "arxiv_id": "2301.00774", 462 "relevance": "One-shot sparsification method for LLMs, complementary compression technique to layer pruning." 463 }, 464 { 465 "title": "Are Emergent Abilities of Large Language Models a Mirage?", 466 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 467 "year": 2023, 468 "relevance": "Questions emergent abilities and evaluation metrics — relevant to how benchmark scores can be misleading." 469 } 470 ] 471 }