scan-v5.json (25561B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "On the Limits of Layer Pruning for Generative Reasoning in LLMs", 6 "authors": [ 7 "Safal Shrestha", 8 "Anubhav Shrestha", 9 "Aadim Nepal", 10 "Minwu Kim", 11 "Keith Ross" 12 ], 13 "year": 2026, 14 "venue": "arXiv", 15 "arxiv_id": "2602.01997", 16 "doi": null 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "All abstract claims are verified by Tables 1–3 and Figures 1–8: classification vs. generative gap, SGR achieving up to 90% classification retention and 20–30pp generative gains, and fundamental recovery limits.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper makes causal claims (layer removal degrades arithmetic; SGR improves recovery) and supports them through single-layer ablations, controlled arithmetic probes, and multi-model comparisons across 4 architectures.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": true, 35 "justification": "Claims are bounded to instruct-tuned 7–8B models under constrained post-training regimes; the paper explicitly limits generalization to settings without pretraining-scale data or compute.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": true, 41 "justification": "Section 4.1 explicitly tests and rejects text degeneration as a sufficient explanation for reasoning failures, and Sections 4.2–4.3 identify arithmetic and syntactic degradation as the true root causes.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper clearly distinguishes benchmark accuracy (GSM8K, HumanEval+) from underlying capabilities (arithmetic logprob accuracy, parenthesis tracking error rates), and uses controlled arithmetic probes to isolate specific abilities.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": false, 55 "justification": "There is no dedicated Limitations section; limitations are woven into the combined 'Discussion & Conclusion' (Section 7), which does not meet the criterion of a dedicated section.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "No formal threats-to-validity discussion is present; the paper acknowledges model-family variability and scope restriction to constrained settings, but does not enumerate specific threats such as benchmark contamination or generalization across model scales.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper explicitly bounds claims to 'realistic post-training constraints, without access to pretraining-scale data or compute' and notes results are limited to the 4 model families tested.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Acknowledgements state: 'NYU Abu Dhabi Center for Artificial Intelligence and Robotics, funded by Tamkeen under the Research Institute Award CG010.'", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "All authors are listed as from the Department of Computer Science, New York University Abu Dhabi; no affiliation with LLM vendors being evaluated.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": true, 87 "justification": "Tamkeen is a UAE government research funding body with no commercial stake in LLM compression outcomes.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "The Impact Statement contains no competing interests declaration; no statement about patents, equity, or consulting relationships is present anywhere in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Key terms are operationally defined: classification benchmarks are log-likelihood scoring tasks; generative benchmarks require multi-token solution generation; BI and Reverse Order pruning strategies are explained in Appendix A.6.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 1 lists four explicit bullet-point contributions: sensitivity characterization, systematic failure mode analysis, SGR recovery strategy, and post-recovery arithmetic/syntactic analysis.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 2 explicitly positions this work against prior approaches (knowledge distillation, continued pretraining, lightweight module replacement), explaining why existing methods are insufficient and what gap this paper fills.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "Code is available at https://github.com/safal312/on-the-limits-of-layer-pruning, linked in the paper header footnote.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "All evaluation benchmarks (GSM8K, HumanEval+, MBPP+, XSUM, HellaSwag, PIQA, etc.) are standard publicly available datasets; models and data also noted as released on HuggingFace.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "Appendix A.6 mentions single A100 80GB GPU and QLoRA with NF4 quantization, but no requirements.txt, Dockerfile, or dependency specification is provided in the paper.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "Experimental details in Appendix A.6 cover hyperparameters but do not provide step-by-step reproduction instructions; a reader would need to guess implementation details beyond what is stated.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "No confidence intervals or error bars are reported for any result; all tables report single-run normalized scores.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are used for any comparative claim; differences in percentage points are reported without p-values or hypothesis tests.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Effect sizes are reported throughout in percentage point improvements (e.g., '+31.2pp for Llama BI+Dolci SGR vs. Dolci') with baseline context.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The arithmetic ablation uses 200 problems (Appendix A.4) without justification or power analysis; benchmark evaluation sizes are determined by the benchmark, not chosen by the authors.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "No variance, standard deviation, or run-to-run variability is reported for any experiment; all results are single-point estimates.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Baselines include: unpruned base models, standard SFT on open-source datasets (Alpaca, Dolci), and results from prior work (Lu et al. 2024) for direct comparison.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Comparisons use results from Lu et al. (2024) and contemporary pruning strategies (BI, Reverse Order) from 2024–2025 literature.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 4 conducts systematic single-layer ablations across all layers of 3 model families to identify which layers are sensitive to removal.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Evaluation uses 11 benchmarks: 7 classification (HellaSwag, PIQA, MMLU, WinoGrande, OBQA, ARC-E, ARC-C) and 4 generative (GSM8K, HumanEval+, MBPP+, XSUM), plus arithmetic accuracy and syntactic error rates.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "Human evaluation is not applicable; all benchmarks use automated evaluation metrics appropriate for math, code, and classification tasks.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "All standard benchmarks (GSM8K, HumanEval+, MBPP+, classification benchmarks) have held-out test sets; finetuning is on training splits only.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Results are broken down by model family (4 models), pruning strategy (BI/Reverse/Iterative), training data (Alpaca/Dolci/SGR), and task type (classification vs. generative) throughout Tables 1–6.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Appendix A.3 shows a concrete arithmetic mistake example (32×6=364) and Appendix A.5 shows a parenthesis mismatch error from a pruned model's code output.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "Appendix A.6.2 explicitly shows that even under ideal conditions (task-aligned pruning metric and training data), GSM8K performance cannot be fully recovered, demonstrating a fundamental limit.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Exact model versions are stated: Qwen-2.5-7B-Instruct, Llama-3.1-8B-Instruct, Mistral-7B-Instruct-v0.3, Gemma2-2B-It.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": true, 244 "justification": "The arithmetic probe prompt format is explicitly shown: 'Question: What is (7 + 5) - 6? Answer:'; standard benchmarks use established prompt formats via lm-evaluation-harness.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Appendix A.6 reports: QLoRA with 4-bit NF4 quantization, learning rate 2×10^-4, batch size 8, 50 warmup steps, bf16, sequence length 8192, single A100 80GB GPU.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "No agentic scaffolding is used; the paper evaluates standard language model inference without agent frameworks.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "For arithmetic ablation, the EleutherAI/arithmetic dataset with single-digit, three-operations subset is specified; output space is restricted to digits 0–9; SGR generation procedure is described (prompts fed to unpruned base model).", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "Raw model outputs (e.g., pruned model responses) are not released as part of the paper; only aggregate benchmark scores are reported.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "SGR data generation is described (unpruned base model generates 8 responses per prompt); benchmark data sources are identified; arithmetic subset selection criteria are stated.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants; standard public benchmarks are used without recruitment.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The SGR pipeline is documented: (1) prompt extraction from open-source datasets, (2) response generation by unpruned base model, (3) SFT of pruned model on self-generated pairs; evaluation uses lm-evaluation-harness.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "Training data cutoffs for Qwen2.5, LLaMA3.1, Mistral, and Gemma2 are not stated or discussed anywhere in the paper.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "Potential overlap between model pretraining data and evaluation benchmarks (GSM8K, HumanEval+, MMLU) is not discussed at all.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "No mention of whether GSM8K, HumanEval+, or MMLU examples were available before training cutoffs; contamination is entirely unaddressed.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "No inference cost or latency measurements are reported; the paper focuses on recovery quality, not computational cost of the compressed models.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "Only the hardware platform is mentioned (single A100 80GB for finetuning, NYU Abu Dhabi HPC cluster); no GPU-hours, FLOPs, or total compute budget is stated.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Generative reasoning tasks are substantially more sensitive to layer pruning than classification benchmarks, with classification retaining ~80%+ but generative tasks dropping to 14–32% retention.", 375 "evidence": "Table 1 shows classification mean retention of 0.655–0.839 vs. generative retention of 0.143–0.327 across four model families at 25% pruning.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Even single-layer removal can cause severe performance drops on GSM8K and HumanEval+, with sharp drops at specific layer positions varying by model family.", 380 "evidence": "Figure 1 shows sharp drops in GSM8K and HumanEval+ performance for specific layers across Qwen, Llama, and Mistral, while XSUM remains largely stable.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Layer pruning disrupts arithmetic computation capabilities, causing structural loss beyond text degeneration effects.", 385 "evidence": "Figure 3 shows substantial drops in arithmetic logprob and accuracy after pruning specific Llama layers using a controlled evaluation that avoids multi-token generation demands.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Text degeneration (repetition, Self-BLEU4) does not fully explain reasoning failures; performance drops occur even when text generation quality remains intact.", 390 "evidence": "Section 4.1 shows that in Qwen and Llama, sharp performance drops on math/coding coincide with minimal degeneration metrics; Mistral shows the reverse pattern.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Self-Generated Response (SGR) finetuning consistently outperforms finetuning on external open-source data for generative recovery, by up to 31.2 percentage points.", 395 "evidence": "Table 2 shows SGR (BI + S.Dolci) achieves 63.4% generative retention for Llama vs. 32.2% for standard BI + Dolci, a +31.2pp improvement.", 396 "supported": "strong" 397 }, 398 { 399 "claim": "Recovery of generative reasoning remains fundamentally limited relative to classification even under favorable conditions, with a persistent gap even after SGR finetuning.", 400 "evidence": "Table 2 shows Llama SGR achieves 90.3% classification vs. 63.4% generative retention; Appendix A.6.2 shows full recovery is impossible even with task-aligned training data.", 401 "supported": "strong" 402 }, 403 { 404 "claim": "Layer pruning is viable for generative tasks primarily at lower pruning ratios (~10–15%), with ~80% retention achievable at 10% pruning.", 405 "evidence": "Figure 6 and Table 6 show ~85–90% generative retention at 2 layers pruned and declining performance beyond 10–15% pruning ratios for both Llama and Qwen.", 406 "supported": "strong" 407 } 408 ], 409 "methodology_tags": [ 410 "benchmark-eval", 411 "observational" 412 ], 413 "key_findings": "Layer pruning severely degrades generative reasoning (math, coding) while classification performance is largely preserved—a gap not explained by text degeneration alone. Pruning disrupts core algorithmic capabilities: arithmetic accuracy drops to ~34% average at 25% pruning, and parenthesis-matching errors spike at specific layers. Self-Generated Response (SGR) finetuning—using the unpruned base model's outputs as training targets—consistently outperforms open-source SFT by 20–31pp on generative benchmarks. Despite these gains, recovery is fundamentally limited: even under ideal task-aligned conditions, pruned models cannot fully recover baseline performance, suggesting that depth reduction irreversibly disrupts functional circuits required for multi-step reasoning.", 414 "red_flags": [ 415 { 416 "flag": "No error bars or variance", 417 "detail": "All results are single-run normalized scores with no confidence intervals, standard deviations, or repeated experiment variance reported across any of the main tables." 418 }, 419 { 420 "flag": "No statistical significance tests", 421 "detail": "All comparative claims (SGR vs. baseline SFT, classification vs. generative) are made without statistical hypothesis tests; differences could be within noise for individual benchmarks." 422 }, 423 { 424 "flag": "Contamination unaddressed", 425 "detail": "Training cutoffs for all four evaluated models are not stated, and potential overlap between pretraining data and evaluation benchmarks (GSM8K, HumanEval+, MMLU) is never discussed." 426 }, 427 { 428 "flag": "Limited model scale", 429 "detail": "All experiments use 2B–8B parameter instruct-tuned models; findings may not generalize to larger models (70B+) or base models, which the paper does not acknowledge as a limitation." 430 }, 431 { 432 "flag": "No competing interests statement", 433 "detail": "The Impact Statement contains no declaration of competing interests, patents, or financial relationships." 434 } 435 ], 436 "cited_papers": [ 437 { 438 "title": "The Unreasonable Ineffectiveness of the Deeper Layers", 439 "relevance": "Prior work showing classification robustness under layer pruning that this paper challenges for generative tasks." 440 }, 441 { 442 "title": "ShortGPT: Layers in Large Language Models Are More Redundant Than You Expect", 443 "relevance": "Key prior work claiming layer redundancy; this paper refutes the generality of that claim for generative reasoning." 444 }, 445 { 446 "title": "Reassessing Layer Pruning in LLMs: New Insights and Methods", 447 "relevance": "Direct comparison baseline whose results are incorporated into Table 1 and Table 3." 448 }, 449 { 450 "title": "Compact Language Models via Pruning and Knowledge Distillation", 451 "relevance": "Large-scale recovery approach (Minitron) that this paper argues is impractical under constrained settings." 452 }, 453 { 454 "title": "LLM Pruning and Distillation in Practice: The Minitron Approach", 455 "relevance": "Another knowledge distillation recovery baseline that requires pretraining-scale data." 456 }, 457 { 458 "title": "When Fewer Layers Break More Chains: Layer Pruning Harms Test-Time Scaling in LLMs", 459 "relevance": "Related concurrent work on layer pruning failure modes for reasoning tasks." 460 }, 461 { 462 "title": "Layer Importance for Mathematical Reasoning Is Forged in Pre-Training and Invariant After Post-Training", 463 "relevance": "Co-authored companion paper on mathematical reasoning sensitivity to layer removal." 464 }, 465 { 466 "title": "LLM Circuit Analyses Are Consistent Across Training and Scale", 467 "relevance": "Cited to explain why functional circuits require pretraining-scale data to form and are hard to reconstruct post-pruning." 468 }, 469 { 470 "title": "Training Verifiers to Solve Math Word Problems (GSM8K)", 471 "relevance": "Primary mathematical reasoning benchmark used throughout the evaluation." 472 } 473 ], 474 "engagement_factors": { 475 "practical_relevance": { 476 "score": 2, 477 "justification": "Directly informs practitioners doing model compression: provides clear guidance on when layer pruning is viable and when it will fail." 478 }, 479 "surprise_contrarian": { 480 "score": 2, 481 "justification": "Challenges the optimistic literature on layer redundancy and classification-validated pruning by showing classification success is a poor proxy for generative capability." 482 }, 483 "fear_safety": { 484 "score": 0, 485 "justification": "No safety or risk implications; the paper is about model compression quality, not alignment or misuse." 486 }, 487 "drama_conflict": { 488 "score": 1, 489 "justification": "Modest conflict angle: exposes limits of a popular technique endorsed by prior work, but not a heated public controversy." 490 }, 491 "demo_ability": { 492 "score": 1, 493 "justification": "Code is released on GitHub but running the experiments requires an A100 GPU and significant compute, limiting casual reproducibility." 494 }, 495 "brand_recognition": { 496 "score": 1, 497 "justification": "NYU Abu Dhabi is a recognized institution; the paper uses prominent models (LLaMA, Qwen, Mistral, Gemma) but is not from a major AI lab." 498 } 499 }, 500 "hn_data": { 501 "threads": [], 502 "top_points": 0, 503 "total_points": 0, 504 "total_comments": 0 505 } 506 }