scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25561B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "On the Limits of Layer Pruning for Generative Reasoning in LLMs",
      6     "authors": [
      7       "Safal Shrestha",
      8       "Anubhav Shrestha",
      9       "Aadim Nepal",
     10       "Minwu Kim",
     11       "Keith Ross"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv",
     15     "arxiv_id": "2602.01997",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims are verified by Tables 1–3 and Figures 1–8: classification vs. generative gap, SGR achieving up to 90% classification retention and 20–30pp generative gains, and fundamental recovery limits.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper makes causal claims (layer removal degrades arithmetic; SGR improves recovery) and supports them through single-layer ablations, controlled arithmetic probes, and multi-model comparisons across 4 architectures.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Claims are bounded to instruct-tuned 2B–8B models under constrained post-training regimes; the paper explicitly limits generalization to settings without pretraining-scale data or compute.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section 4.1 explicitly tests and rejects text degeneration as a sufficient explanation for reasoning failures, and Sections 4.2–4.3 identify arithmetic and syntactic degradation as the true root causes.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper clearly distinguishes benchmark accuracy (GSM8K, HumanEval+) from underlying capabilities (arithmetic logprob accuracy, parenthesis tracking error rates), and uses controlled arithmetic probes to isolate specific abilities.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated Limitations section; limitations are woven into the combined 'Discussion & Conclusion' (Section 7), which does not meet the criterion of a dedicated section.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No formal threats-to-validity discussion is present; the paper acknowledges model-family variability and scope restriction to constrained settings, but does not enumerate specific threats such as benchmark contamination or generalization across model scales.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly bounds claims to 'realistic post-training constraints, without access to pretraining-scale data or compute' and notes results are limited to the 4 model families tested.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgements state: 'NYU Abu Dhabi Center for Artificial Intelligence and Robotics, funded by Tamkeen under the Research Institute Award CG010.'",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors are listed as from the Department of Computer Science, New York University Abu Dhabi; no affiliation with LLM vendors being evaluated.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Tamkeen is a UAE government research funding body with no commercial stake in LLM compression outcomes.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The Impact Statement contains no competing interests declaration; no statement about patents, equity, or consulting relationships is present anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are operationally defined: classification benchmarks are log-likelihood scoring tasks; generative benchmarks require multi-token solution generation; BI and Reverse Order pruning strategies are explained in Appendix A.6.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 1 lists four explicit bullet-point contributions: sensitivity characterization, systematic failure mode analysis, SGR recovery strategy, and post-recovery arithmetic/syntactic analysis.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 explicitly positions this work against prior approaches (knowledge distillation, continued pretraining, lightweight module replacement), explaining why existing methods are insufficient and what gap this paper fills.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Code is available at https://github.com/safal312/on-the-limits-of-layer-pruning, linked in the paper header footnote.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All evaluation benchmarks (GSM8K, HumanEval+, MBPP+, XSUM, HellaSwag, PIQA, etc.) are standard publicly available datasets; models and data also noted as released on HuggingFace.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Appendix A.6 mentions single A100 80GB GPU and QLoRA with NF4 quantization, but no requirements.txt, Dockerfile, or dependency specification is provided in the paper.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Experimental details in Appendix A.6 cover hyperparameters but do not provide step-by-step reproduction instructions; a reader would need to guess implementation details beyond what is stated.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "No confidence intervals or error bars are reported for any result; all tables report single-run normalized scores.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are used for any comparative claim; differences in percentage points are reported without p-values or hypothesis tests.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Effect sizes are reported throughout in percentage point improvements (e.g., '+31.2pp for Llama BI+Dolci SGR vs. Dolci') with baseline context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The arithmetic ablation uses 200 problems (Appendix A.4) without justification or power analysis; benchmark evaluation sizes are determined by the benchmark, not chosen by the authors.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or run-to-run variability is reported for any experiment; all results are single-point estimates.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Baselines include: unpruned base models, standard SFT on open-source datasets (Alpaca, Dolci), and results from prior work (Lu et al. 2024) for direct comparison.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Comparisons use results from Lu et al. (2024) and contemporary pruning strategies (BI, Reverse Order) from 2024–2025 literature.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 4 conducts systematic single-layer ablations across all layers of 3 model families to identify which layers are sensitive to removal.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Evaluation uses 11 benchmarks: 7 classification (HellaSwag, PIQA, MMLU, WinoGrande, OBQA, ARC-E, ARC-C) and 4 generative (GSM8K, HumanEval+, MBPP+, XSUM), plus arithmetic accuracy and syntactic error rates.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Human evaluation is not applicable; all benchmarks use automated evaluation metrics appropriate for math, code, and classification tasks.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "All standard benchmarks (GSM8K, HumanEval+, MBPP+, classification benchmarks) have held-out test sets; finetuning is on training splits only.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by model family (4 models), pruning strategy (BI/Reverse/Iterative), training data (Alpaca/Dolci/SGR), and task type (classification vs. generative) throughout Tables 1–6.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Appendix A.3 shows a concrete arithmetic mistake example (32×6=364) and Appendix A.5 shows a parenthesis mismatch error from a pruned model's code output.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Appendix A.6.2 explicitly shows that even under ideal conditions (task-aligned pruning metric and training data), GSM8K performance cannot be fully recovered, demonstrating a fundamental limit.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Exact model versions are stated: Qwen-2.5-7B-Instruct, Llama-3.1-8B-Instruct, Mistral-7B-Instruct-v0.3, Gemma2-2B-It.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "The arithmetic probe prompt format is explicitly shown: 'Question: What is (7 + 5) - 6? Answer:'; standard benchmarks use established prompt formats via lm-evaluation-harness.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Appendix A.6 reports: QLoRA with 4-bit NF4 quantization, learning rate 2×10^-4, batch size 8, 50 warmup steps, bf16, sequence length 8192, single A100 80GB GPU.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used; the paper evaluates standard language model inference without agent frameworks.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "For arithmetic ablation, the EleutherAI/arithmetic dataset with single-digit, three-operations subset is specified; output space is restricted to digits 0–9; SGR generation procedure is described (prompts fed to unpruned base model).",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Raw model outputs (e.g., pruned model responses) are not released as part of the paper; only aggregate benchmark scores are reported.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "SGR data generation is described (unpruned base model generates 8 responses per prompt); benchmark data sources are identified; arithmetic subset selection criteria are stated.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; standard public benchmarks are used without recruitment.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The SGR pipeline is documented: (1) prompt extraction from open-source datasets, (2) response generation by unpruned base model, (3) SFT of pruned model on self-generated pairs; evaluation uses lm-evaluation-harness.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs for Qwen2.5, LLaMA3.1, Mistral, and Gemma2 are not stated or discussed anywhere in the paper.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Potential overlap between model pretraining data and evaluation benchmarks (GSM8K, HumanEval+, MMLU) is not discussed at all.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "No mention of whether GSM8K, HumanEval+, or MMLU examples were available before training cutoffs; contamination is entirely unaddressed.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost or latency measurements are reported; the paper focuses on recovery quality, not computational cost of the compressed models.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Only the hardware platform is mentioned (single A100 80GB for finetuning, NYU Abu Dhabi HPC cluster); no GPU-hours, FLOPs, or total compute budget is stated.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Generative reasoning tasks are substantially more sensitive to layer pruning than classification benchmarks, with classification retaining ~66–84% but generative tasks dropping to 14–32% retention.",
    375       "evidence": "Table 1 shows classification mean retention of 0.655–0.839 vs. generative retention of 0.143–0.327 across four model families at 25% pruning.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Even single-layer removal can cause severe performance drops on GSM8K and HumanEval+, with sharp drops at specific layer positions varying by model family.",
    380       "evidence": "Figure 1 shows sharp drops in GSM8K and HumanEval+ performance for specific layers across Qwen, Llama, and Mistral, while XSUM remains largely stable.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Layer pruning disrupts arithmetic computation capabilities, causing structural loss beyond text degeneration effects.",
    385       "evidence": "Figure 3 shows substantial drops in arithmetic logprob and accuracy after pruning specific Llama layers using a controlled evaluation that avoids multi-token generation demands.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Text degeneration (repetition, Self-BLEU4) does not fully explain reasoning failures; performance drops occur even when text generation quality remains intact.",
    390       "evidence": "Section 4.1 shows that in Qwen and Llama, sharp performance drops on math/coding coincide with minimal degeneration metrics; Mistral shows the reverse pattern.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Self-Generated Response (SGR) finetuning consistently outperforms finetuning on external open-source data for generative recovery, by up to 31.2 percentage points.",
    395       "evidence": "Table 2 shows SGR (BI + S.Dolci) achieves 63.4% generative retention for Llama vs. 32.2% for standard BI + Dolci, a +31.2pp improvement.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Recovery of generative reasoning remains fundamentally limited relative to classification even under favorable conditions, with a persistent gap even after SGR finetuning.",
    400       "evidence": "Table 2 shows Llama SGR achieves 90.3% classification vs. 63.4% generative retention; Appendix A.6.2 shows full recovery is impossible even with task-aligned training data.",
    401       "supported": "strong"
    402     },
    403     {
    404       "claim": "Layer pruning is viable for generative tasks primarily at lower pruning ratios (~10–15%), with ~80% retention achievable at 10% pruning.",
    405       "evidence": "Figure 6 and Table 6 show ~85–90% generative retention at 2 layers pruned and declining performance beyond 10–15% pruning ratios for both Llama and Qwen.",
    406       "supported": "strong"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval",
    411     "observational"
    412   ],
    413   "key_findings": "Layer pruning severely degrades generative reasoning (math, coding) while classification performance is largely preserved—a gap not explained by text degeneration alone. Pruning disrupts core algorithmic capabilities: arithmetic accuracy drops to ~34% average at 25% pruning, and parenthesis-matching errors spike at specific layers. Self-Generated Response (SGR) finetuning—using the unpruned base model's outputs as training targets—consistently outperforms open-source SFT by 20–31pp on generative benchmarks. Despite these gains, recovery is fundamentally limited: even under ideal task-aligned conditions, pruned models cannot fully recover baseline performance, suggesting that depth reduction irreversibly disrupts functional circuits required for multi-step reasoning.",
    414   "red_flags": [
    415     {
    416       "flag": "No error bars or variance",
    417       "detail": "All results are single-run normalized scores with no confidence intervals, standard deviations, or repeated experiment variance reported across any of the main tables."
    418     },
    419     {
    420       "flag": "No statistical significance tests",
    421       "detail": "All comparative claims (SGR vs. baseline SFT, classification vs. generative) are made without statistical hypothesis tests; differences could be within noise for individual benchmarks."
    422     },
    423     {
    424       "flag": "Contamination unaddressed",
    425       "detail": "Training cutoffs for all four evaluated models are not stated, and potential overlap between pretraining data and evaluation benchmarks (GSM8K, HumanEval+, MMLU) is never discussed."
    426     },
    427     {
    428       "flag": "Limited model scale",
    429       "detail": "All experiments use 2B–8B parameter instruct-tuned models; findings may not generalize to larger models (70B+) or base models, which the paper does not acknowledge as a limitation."
    430     },
    431     {
    432       "flag": "No competing interests statement",
    433       "detail": "The Impact Statement contains no declaration of competing interests, patents, or financial relationships."
    434     }
    435   ],
    436   "cited_papers": [
    437     {
    438       "title": "The Unreasonable Ineffectiveness of the Deeper Layers",
    439       "relevance": "Prior work showing classification robustness under layer pruning that this paper challenges for generative tasks."
    440     },
    441     {
    442       "title": "ShortGPT: Layers in Large Language Models Are More Redundant Than You Expect",
    443       "relevance": "Key prior work claiming layer redundancy; this paper refutes the generality of that claim for generative reasoning."
    444     },
    445     {
    446       "title": "Reassessing Layer Pruning in LLMs: New Insights and Methods",
    447       "relevance": "Direct comparison baseline whose results are incorporated into Table 1 and Table 3."
    448     },
    449     {
    450       "title": "Compact Language Models via Pruning and Knowledge Distillation",
    451       "relevance": "Large-scale recovery approach (Minitron) that this paper argues is impractical under constrained settings."
    452     },
    453     {
    454       "title": "LLM Pruning and Distillation in Practice: The Minitron Approach",
    455       "relevance": "Another knowledge distillation recovery baseline that requires pretraining-scale data."
    456     },
    457     {
    458       "title": "When Fewer Layers Break More Chains: Layer Pruning Harms Test-Time Scaling in LLMs",
    459       "relevance": "Related concurrent work on layer pruning failure modes for reasoning tasks."
    460     },
    461     {
    462       "title": "Layer Importance for Mathematical Reasoning Is Forged in Pre-Training and Invariant After Post-Training",
    463       "relevance": "Co-authored companion paper on mathematical reasoning sensitivity to layer removal."
    464     },
    465     {
    466       "title": "LLM Circuit Analyses Are Consistent Across Training and Scale",
    467       "relevance": "Cited to explain why functional circuits require pretraining-scale data to form and are hard to reconstruct post-pruning."
    468     },
    469     {
    470       "title": "Training Verifiers to Solve Math Word Problems (GSM8K)",
    471       "relevance": "Primary mathematical reasoning benchmark used throughout the evaluation."
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 2,
    477       "justification": "Directly informs practitioners doing model compression: provides clear guidance on when layer pruning is viable and when it will fail."
    478     },
    479     "surprise_contrarian": {
    480       "score": 2,
    481       "justification": "Challenges the optimistic literature on layer redundancy and classification-validated pruning by showing classification success is a poor proxy for generative capability."
    482     },
    483     "fear_safety": {
    484       "score": 0,
    485       "justification": "No safety or risk implications; the paper is about model compression quality, not alignment or misuse."
    486     },
    487     "drama_conflict": {
    488       "score": 1,
    489       "justification": "Modest conflict angle: exposes limits of a popular technique endorsed by prior work, but not a heated public controversy."
    490     },
    491     "demo_ability": {
    492       "score": 1,
    493       "justification": "Code is released on GitHub but running the experiments requires an A100 GPU and significant compute, limiting casual reproducibility."
    494     },
    495     "brand_recognition": {
    496       "score": 1,
    497       "justification": "NYU Abu Dhabi is a recognized institution; the paper uses prominent models (LLaMA, Qwen, Mistral, Gemma) but is not from a major AI lab."
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [],
    502     "top_points": 0,
    503     "total_points": 0,
    504     "total_comments": 0
    505   }
    506 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs