scan-v5.json (26651B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Importance-Aware Data Selection for Efficient LLM Instruction Tuning", 6 "authors": [ 7 "Tingyu Jiang", 8 "Shen Li", 9 "Yiyao Song", 10 "Lan Zhang", 11 "Hualei Zhu" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2511.07074", 16 "doi": "10.48550/arXiv.2511.07074" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "The abstract's core claim that top 1% MIWV-selected data outperforms full-dataset training is directly supported by Table 1 and Figures 2–4, showing win rates consistently above 1.0 across multiple model and dataset combinations.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper makes causal claims that MIWV-selected data improves performance, and these are supported by ablation studies (Section 4.6) directly comparing MIWV vs. 
random, high-prompt-loss, and low-MIWV strategies on identical architectures.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper claims MIWV is a 'universal data selection method applicable to all LLMs' and 'can be applied to all LLMs,' but experiments cover only LLaMA-7B, LLaMA2-7B/13B, and Qwen2.5-7B/14B — a narrow slice of open-weight models with no closed-model or non-decoder testing.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper does not consider that high-MIWV samples may simply correlate with data complexity or diversity under alternative metrics; no alternative explanation for why ICL loss discrepancy identifies valuable data is discussed.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper claims MIWV 'enhances the model's capabilities' but measures GPT-4 pairwise win rates and benchmark scores; the distinction between GPT-4 judge preference and actual instruction-following capability is not discussed, and AlpacaEval is run on only 5% of its dataset 'due to budget constraints.'", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": false, 55 "justification": "There is no dedicated limitations or threats-to-validity section anywhere in the paper; the conclusion only presents positive framing.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "No threats to validity are discussed: GPT-4 as judge for data generated by GPT-like systems, potential circularity, lack of error bars, or domain specificity are never raised.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper asserts 
universal applicability without specifying conditions under which MIWV would not apply (e.g., datasets where ICL is unreliable, very long instruction samples, or non-English settings).", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Acknowledgment states: 'This work was supported by JST CREST Grant Number JPMJCR21M2, including the AIP Challenge Program.'", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are clearly listed on the title page: Alibaba Cloud Computing, Independent Researcher, and University of Tokyo.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": true, 87 "justification": "The funder is JST CREST, a Japanese government agency, which is independent of the outcome; however, the majority of authors are Alibaba Cloud employees with commercial interest in efficient fine-tuning — this institutional conflict is undisclosed.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears anywhere in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "MIWV is formally defined (Equation 8), and the paper defines prompt loss (Eq. 7) and loss without one-shot context (Eq. 
5); 'instruction tuning' and 'in-context learning' are used with adequate contextual explanation for the field.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper explicitly lists three numbered contributions at the end of the Introduction: the universal data selection method, the MIWV metric, and experimental validation of superiority.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 2.2 reviews eight specific competing methods (InstructMining, INSTAG, Alpagasus, QDIT, Deita, RECOST, SelectIT, DiverseEvol) and Section 4.5 directly benchmarks against them, explaining why each is weaker.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": false, 124 "justification": "No code repository, GitHub link, or promise of public release is mentioned anywhere in the paper.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "All training datasets used (Alpaca, WizardLM) and evaluation benchmarks (Vicuna, Koala, LIMA, Self-instruct, Open LLM Leaderboard datasets) are existing publicly available resources used unmodified.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "The software and hardware environment is partially stated (PyTorch 2.0.1, A100 80GB, Xeon 8369B) and basic training parameters are in Appendix A, but no requirements.txt, Dockerfile, or complete dependency list is provided.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper references the 'Alpaca codebase' and lists hyperparameters in Appendix A, but provides no step-by-step instructions for running the MIWV selection pipeline or
reproducing results from scratch.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "Experiments are repeated three times and arithmetic means are reported, but no confidence intervals, error bars, or standard deviations appear in any result table or figure.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are applied to any of the comparative win rate claims; numerical differences are presented as evidence without p-values or confidence bounds.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Win rates (e.g., 1.127 for 1% vs. 1.000 baseline) and absolute benchmark score differences (e.g., ARC 57.25 vs. 54.35) provide interpretable relative magnitude of effects.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The number of test examples and training subsets are stated but not justified; no power analysis or discussion of whether the test sets are large enough to detect the observed differences is provided.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "The paper states 'all experiments are repeated three times with arithmetic mean results reported' but never reports standard deviation, variance, or spread across the three runs.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "The full-dataset-trained model serves as the primary baseline, and Table 2 compares against eight competing methods: IFD Score, SelectIT, Superfiltering, Alpagasus, Deita, DiverseEvol, Nuggets, and RECOST.", 183 "source": "haiku" 184 
}, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "All competing methods are from 2023–2024 (e.g., Superfiltering ACL 2024, SelectIT 2024, RECOST 2024), and MIWV outperforms all of them on win rate.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 4.6 conducts ablations on data selection strategy (random, high prompt loss, low MIWV vs. MIWV) and embedding model choice (bge-en-large, multilingual-e5-large, gte-base-en-v1.5).", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Three distinct evaluation frameworks are used: GPT-4 pairwise win rate, Hugging Face Open LLM Leaderboard (ARC, HellaSwag, MMLU, TruthfulQA), and AlpacaEval.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": true, 205 "answer": false, 206 "justification": "No human evaluation is conducted; GPT-4 is used as an automated judge as a substitute for human raters, with no validation of judge reliability reported.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "Five distinct test datasets (Vicuna, Koala, WizardLM test set, Self-instruct, LIMA) totaling 1,030 instruction samples are held out from training data.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Table 1 breaks down Open LLM Leaderboard results by individual benchmark (ARC, HellaSwag, MMLU, TruthfulQA); Figures 2–13 break results down by individual test set.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": false, 224 "justification": "Section 4.8 presents only a success case study showing the 1% model answering a math question correctly; no failure cases or conditions where MIWV underperforms are examined.", 225 "source": "haiku" 226
}, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "The ablation (Figure 3a) explicitly shows that random selection, high-prompt-loss selection, and low-MIWV selection all produce models that underperform the full-dataset baseline.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "LLaMA-7B, LLaMA2-7B, LLaMA2-13B, Qwen2.5-7B, and Qwen2.5-14B are specific named versions; while checkpoint dates are absent, these are widely-known standard releases with sufficient specificity for reproducibility.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": true, 244 "justification": "Appendix F (Table 5) provides the full system prompt and user prompt template used for GPT-4 evaluation; instruction tuning uses the standard Alpaca prompt format (referenced).", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Appendix A reports: Adam optimizer, learning rate 2×10⁻⁵, batch size 128, 3 epochs, and maximum input lengths (512/1024/2048) per model and dataset combination.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "This is a supervised fine-tuning paper with no agentic scaffolding; the question is not applicable.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Appendix A documents filtering of 'AI censure' samples from WizardLM and the embedding-based one-shot retrieval pipeline using bge-en-large with mean pooling and cosine similarity is fully described in Section 3.1.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": true, 270 "justification": "Alpaca and WizardLM datasets 
are publicly available standard resources; benchmark test sets (Vicuna, Koala, LIMA, etc.) are also publicly available.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Appendix A describes dataset construction: Alpaca uses self-instruction via text-DaVinci-003 (52,002 samples); WizardLM uses Evol-Instruct (63,655 samples); both are well-documented in their original papers.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants; only standard public benchmark datasets are used.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "Sections 3.1–3.3 document the full pipeline: embedding computation → cosine-similarity one-shot retrieval → MIWV loss computation → MIWV ranking → subset selection → instruction tuning.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "The pretraining data cutoffs for LLaMA, LLaMA2, and Qwen2.5 are never stated; the paper does not discuss what data these base models were trained on.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "No discussion of whether test sets (Vicuna, Koala, LIMA, etc.) 
may have been in the pretraining data of LLaMA or LLaMA2; contamination is entirely unaddressed.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "Open LLM Leaderboard benchmarks (ARC, HellaSwag, MMLU, TruthfulQA) were publicly available long before LLaMA/LLaMA2 pretraining cutoffs; no contamination assessment is performed.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants involved.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants involved.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants involved.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants involved.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants involved.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants involved.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants involved.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": true, 360 "justification": "Table 2 reports that MIWV selection takes 85 minutes compared to 8–300 minutes for competing methods, making it the second-fastest method evaluated.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": 
"Hardware specifications are given (A100 80GB GPUs) but total GPU-hours for training, or the computational budget for running MIWV across the full datasets, are not reported.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Selecting the top 1% of data by MIWV produces a model that outperforms one trained on the full dataset across multiple evaluation benchmarks.", 375 "evidence": "Table 1 shows pairwise win rates of 1.063–1.127 for LLaMA2-7B/13B on Alpaca 1% vs. 100%; consistent improvement also on Open LLM Leaderboard average scores.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "MIWV is a universal metric applicable to all LLMs and both Alpaca-style and WizardLM-style instruction datasets.", 380 "evidence": "Tested on LLaMA-7B, LLaMA2-7B/13B, and Qwen2.5-7B/14B on two datasets; results are consistently positive but the model family coverage is narrow and all are open-weight decoder-only models.", 381 "supported": "weak" 382 }, 383 { 384 "claim": "MIWV outperforms eight competing data selection methods in win rate on the WizardLM test set.", 385 "evidence": "Table 2 shows MIWV achieving the highest win rates at 1%, 5%, 10%, and 15% data fractions compared to IFD Score, SelectIT, Superfiltering, Alpagasus, Deita, DiverseEvol, Nuggets, and RECOST.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Higher MIWV samples exhibit more uniform distribution across the instruction space, correlating with data diversity.", 390 "evidence": "t-SNE visualization (Figure 6) shows top-5% MIWV samples spread across the embedding space while bottom-5% cluster; this is a qualitative, informal argument not supported by a quantitative diversity metric.", 391 "supported": "weak" 392 }, 393 { 394 "claim": "ICL-guided IFD score selection outperforms the original cluster-based IFD Score method.", 395 "evidence": "Table 3 shows ICL+IFD achieves overall win rate of 1.017 vs. 
IFD Score's 0.939 on five test sets using GPT-4 judgment on the 1% subset.", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "Increasing training data proportion beyond an optimal point degrades win rate due to noise and data interference.", 400 "evidence": "Figures 2–4 show win rates declining as data proportion increases beyond 10–20%, interpreted as evidence of noise; however, the models still consistently outperform the 100% baseline, and no noise analysis is performed.", 401 "supported": "weak" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval", 406 "observational" 407 ], 408 "key_findings": "The paper proposes Model Instruction Weakness Value (MIWV), computed as the difference in model loss with vs. without a one-shot ICL example, to identify instruction samples that expose model weaknesses and are therefore most valuable for fine-tuning. Training LLaMA2-7B/13B on only the top 1% of MIWV-ranked Alpaca data (520 samples) consistently outperforms training on the full 52,002-sample dataset across GPT-4 pairwise evaluation and Open LLM Leaderboard benchmarks. MIWV achieves the best win rates against eight competing data selection methods while requiring only 85 minutes for selection — second-fastest after Superfiltering. Results generalize to the WizardLM dataset and the Qwen2.5 model family, supporting (though not proving) the method's broader applicability.", 409 "red_flags": [ 410 { 411 "flag": "GPT-4 judge circularity", 412 "detail": "Alpaca and WizardLM datasets are generated by GPT-like APIs, yet GPT-4 is used as the primary quality judge for pairwise comparisons. This creates potential circularity: models fine-tuned to mimic GPT-style outputs will be favored by a GPT-4 judge independent of actual capability improvement."
413 }, 414 { 415 "flag": "No variance across repeated runs", 416 "detail": "Experiments are run three times and means reported, but no standard deviations are provided; statistical significance of win rate differences is never established, making it impossible to assess whether observed advantages are reliable." 417 }, 418 { 419 "flag": "Universality overclaim", 420 "detail": "The method is claimed to be 'applicable to all LLMs' and 'universal' but is tested only on five open-weight decoder-only models from two families (LLaMA/LLaMA2 and Qwen2.5); no instruction-tuned baselines, non-English models, or closed models are tested." 421 }, 422 { 423 "flag": "AlpacaEval on 5% subsample", 424 "detail": "AlpacaEval is run on only 5% of the dataset 'due to budget constraints,' severely limiting statistical reliability of this metric and making comparisons unreliable." 425 }, 426 { 427 "flag": "No limitations section", 428 "detail": "The paper contains no limitations or threats-to-validity section; conditions under which MIWV fails, edge cases in the one-shot retrieval (irrelevant nearest neighbors), or domain specificity are never discussed." 429 }, 430 { 431 "flag": "Benchmark contamination unaddressed", 432 "detail": "Open LLM Leaderboard benchmarks (ARC, HellaSwag, MMLU, TruthfulQA) predate LLaMA/LLaMA2 pretraining; no contamination analysis is performed despite these benchmarks being primary evaluation metrics."
433 } 434 ], 435 "cited_papers": [ 436 { 437 "title": "From Quantity to Quality: Boosting LLM Performance with Self-Guided Data Selection for Instruction Tuning (IFD Score)", 438 "relevance": "Primary competing method; MIWV is directly compared against and outperforms IFD Score in ablation and main comparison experiments" 439 }, 440 { 441 "title": "LIMA: Less Is More for Alignment", 442 "relevance": "Closely related finding that small high-quality datasets suffice for instruction tuning; provides theoretical grounding for the paper's central premise" 443 }, 444 { 445 "title": "What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning (DEITA)", 446 "relevance": "Competing method that controls both quality and diversity; directly benchmarked against MIWV in Table 2" 447 }, 448 { 449 "title": "Superfiltering: Weak-to-Strong Data Filtering for Fast Instruction-Tuning", 450 "relevance": "Competing method that achieves similar efficiency to MIWV; the only method faster than MIWV in selection time" 451 }, 452 { 453 "title": "SelectIT: Selective Instruction Tuning for Large Language Models via Uncertainty-Aware Self-Reflection", 454 "relevance": "Competing method requiring model training for selection; used in Table 2 comparison" 455 }, 456 { 457 "title": "RECOST: External Knowledge Guided Data-Efficient Instruction Tuning", 458 "relevance": "Competing method using conditional entropy and external knowledge; directly compared in Table 2" 459 }, 460 { 461 "title": "AlpaGasus: Training a Better Alpaca with Fewer Data", 462 "relevance": "Early work showing data quality selection over quantity for instruction tuning; uses ChatGPT for filtering, a key limitation relative to MIWV" 463 }, 464 { 465 "title": "Stanford Alpaca: An Instruction-Following LLaMA Model", 466 "relevance": "Primary training dataset used throughout the paper; provides the base instruction set for MIWV evaluation" 467 } 468 ], 469 "engagement_factors": { 470
"practical_relevance": { 471 "score": 3, 472 "justification": "The method claims 99% training cost reduction (1% data) with improved performance — immediately actionable for any practitioner doing LLM fine-tuning with compute constraints." 473 }, 474 "surprise_contrarian": { 475 "score": 2, 476 "justification": "The finding that 1% of data beats 100% is counterintuitive and challenges the conventional 'more data is better' assumption in fine-tuning practice." 477 }, 478 "fear_safety": { 479 "score": 0, 480 "justification": "The paper raises no AI safety concerns; it is purely a training efficiency paper." 481 }, 482 "drama_conflict": { 483 "score": 0, 484 "justification": "No controversy, retraction risk, or interpersonal conflict angle is present." 485 }, 486 "demo_ability": { 487 "score": 2, 488 "justification": "The concept is straightforward to demonstrate with LLaMA and public datasets, though no code is released to enable immediate replication." 489 }, 490 "brand_recognition": { 491 "score": 1, 492 "justification": "Alibaba Cloud and University of Tokyo are known institutions but not top-tier AI labs; the paper lacks association with a flagship product or widely-known research group." 493 } 494 }, 495 "hn_data": { 496 "threads": [ 497 { 498 "hn_id": "10641304", 499 "title": "SceneNet: Understanding Real World Scenes with Synthetic Data", 500 "points": 5, 501 "comments": 1, 502 "url": "https://news.ycombinator.com/item?id=10641304", 503 "created_at": "2015-11-28T15:35:43Z" 504 } 505 ], 506 "top_points": 5, 507 "total_points": 5, 508 "total_comments": 1 509 } 510 }