scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22956B)
      1 {
      2   "paper": {
      3     "title": "Importance-Aware Data Selection for Efficient LLM Instruction Tuning",
      4     "authors": [
      5       "Tingyu Jiang",
      6       "Shen Li",
      7       "Yiyao Song",
      8       "Lan Zhang",
      9       "Hualei Zhu",
     10       "Yuan Zhao",
     11       "Xiaohang Xu",
     12       "Kenjiro Taura",
     13       "Hao Henry Wang"
     14     ],
     15     "year": 2025,
     16     "venue": "arXiv (AAAI 2026 copyright notice)",
     17     "arxiv_id": "2511.07074",
     18     "doi": "10.48550/arXiv.2511.07074"
     19   },
     20   "scan_version": 2,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval"],
     23   "key_findings": "The paper proposes MIWV, a metric that selects instruction tuning data by measuring loss differences with and without one-shot ICL examples. Selecting just the top 1% of data by MIWV outperforms training on the full dataset across Alpaca and WizardLM datasets on LLaMA/LLaMA2/Qwen2.5 models. The method requires no external LLM or model training for data selection; selection takes 85 minutes, versus 8-300 minutes for competing methods (Table 2). Ablation studies show random selection, high prompt loss, and low MIWV all fail to match the full-data baseline.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No repository URL or code link is provided anywhere in the paper."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper uses publicly available datasets (Alpaca, WizardLM, NIV2) and references them with citations. No proprietary data was collected."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Section 4.2 mentions PyTorch 2.0.1, Linux, 984GB RAM, Intel Xeon 8369B, Nvidia A100 80GB GPUs, but no requirements.txt, Dockerfile, or library version list is provided."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions are provided. The paper mentions using the 'Alpaca codebase' but provides no scripts or commands."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No confidence intervals or error bars are reported in any table or figure despite claiming experiments are repeated three times."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No statistical significance tests are used. Claims of outperformance are based solely on comparing win rates and benchmark scores."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Tables 1 and 2 report absolute benchmark scores and win rates with baseline context (e.g., win rate relative to 1.0 baseline, specific ARC/HellaSwag/MMLU/TruthfulQA scores for both baseline and selected-data models)."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No justification for why 1%, 5%, 10%, 15%, 20%, 25% data ratios were chosen, or why three repetitions are sufficient."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "The paper states 'All experiments are repeated three times with arithmetic mean results reported' (Section 4.2) but no standard deviations or variance measures are provided."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 2 compares against 8 methods: IFD Score, SelectIT, Superfiltering, Alpagasus, Deita, DiverseEvol, Nuggets, RECOST. Full-dataset training serves as the primary baseline."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Baselines include recent works like Deita (ICLR 2024), SelectIT (2024), Superfiltering (ACL 2024), RECOST (2024)."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 4.6 provides ablation on data selection strategy (random, high prompt loss, low MIWV vs. high MIWV) and on embedding model choice (Bge-en-large, Multilingual-e5-large, Gte-base-en-v1.5)."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Three evaluation approaches: Pair-wise win rate (GPT-4 judge), Open LLM Leaderboard (ARC, HellaSwag, MMLU, TruthfulQA), and AlpacaEval."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "All evaluation is automated. GPT-4 serves as the judge; no human evaluation of outputs is performed."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Five separate test datasets (Vicuna, Koala, WizardLM, Self-instruct, LIMA) are used, distinct from the training datasets."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 1 breaks down Open LLM Leaderboard into ARC, HellaSwag, MMLU, TruthfulQA. Results are shown per test set and per data ratio."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "No failure analysis. The single case study (Figure 5) shows only a success. No discussion of where the method breaks down."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The ablation study in Section 4.6 explicitly shows that random selection, high prompt loss, and low MIWV all produce models worse than the full-data baseline."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Abstract claims that top 1% outperforms full dataset are supported by Table 1 (win rates > 1.0 at 1%) and Figures 2a/2b."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Causal claims ('MIWV improves performance') are supported by controlled ablation: same model, same training setup, only the data selection strategy varies. The ablation design isolates the MIWV variable adequately."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper claims a 'universal data selection method' applicable to 'all LLMs' (Contribution 1), but tests only on the LLaMA, LLaMA2, Qwen2.5, and T5 (Appendix D) model families. The title and abstract make broad claims not bounded to tested settings."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "No discussion of alternative explanations. For example, the high-MIWV subset might simply have higher diversity (the t-SNE analysis hints at this) rather than the ICL mechanism being causal."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper uses GPT-4 win rate as a proxy for instruction-following quality without discussing limitations of LLM-as-judge (position bias acknowledged but deeper proxy issues unaddressed). Win rate measures GPT-4 preference, not necessarily actual capability improvement."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Models are specified as 'LLaMA-7B', 'LLaMA2-7B/13B', 'Qwen2.5-7B/14B' without exact version strings or checkpoint identifiers. GPT-4 is used as judge without specifying the API version."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Table 5 (Appendix F) provides the full evaluation prompt used for GPT-4 judging. The one-shot ICL prompt construction is described mathematically in Section 3."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Appendix A reports learning rate (2e-5), batch size (128), epochs (3), max input lengths (512/1024/2048). Embedding model specified as Bge-en-large."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. The method is a data selection pipeline, not an agent."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Appendix A describes dataset sizes and filtering: 'adopt existing methodologies to filter samples subject to AI censure in WizardLM, resulting in 63,655 instances.' The MIWV computation pipeline is documented in Section 3."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "There is no limitations or threats-to-validity section in the paper."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No threats to validity are discussed anywhere in the paper."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No explicit scope boundaries. The paper claims universality ('applicable to all LLMs') without stating what it does NOT show."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The MIWV scores, selected subsets, and per-sample rankings are not released. Only aggregate results are shown."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Datasets are standard public benchmarks (Alpaca, WizardLM, NIV2) and their origins are described with citations."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. Data sources are standard public benchmarks."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Section 3 documents the full pipeline: embedding computation → cosine similarity retrieval → MIWV computation → ranking → subset selection. Figure 1 provides an overview diagram."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Acknowledgments section states: 'This work was supported by JST CREST Grant Number JPMJCR21M2, including the AIP Challenge Program.'"
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations clearly listed: Alibaba Cloud Computing, Independent Researcher, University of Tokyo."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "JST CREST is a Japanese government research funding agency with no commercial stake in the outcome."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests statement is present. Authors from Alibaba Cloud could have commercial interest in efficient training methods."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No training data cutoff dates are stated for any of the models used (LLaMA, LLaMA2, Qwen2.5)."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No discussion of whether test datasets (Vicuna, Koala, etc.) overlap with the pre-training data of LLaMA/LLaMA2/Qwen2.5."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "Benchmarks like ARC, HellaSwag, MMLU, TruthfulQA were all published before LLaMA2/Qwen2.5 training. No contamination discussion."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Table 2 reports data selection time: MIWV takes 85 minutes, compared to 8-300 minutes for alternatives."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Hardware is described (A100 GPUs) but total GPU hours for training experiments are not reported. Only data selection time is given."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Paper states experiments repeated 3 times but reports only means — no per-seed results or sensitivity analysis."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Section 4.2: 'All experiments are repeated three times with arithmetic mean results reported.'"
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No hyperparameter search budget reported. Training parameters are stated but it's unclear if they were tuned."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Multiple data ratios (1-25%) are tested but no clear validation-based selection criterion is described for choosing the best ratio."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Many comparisons across models, datasets, ratios, and test sets are made with no correction for multiple comparisons."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "Authors re-implement baselines for comparison (Table 2) without acknowledging self-implementation bias."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "Models trained on 1% data use far less compute than 100% baseline, but this compute advantage is not discussed or controlled for."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "GPT-4-as-judge validity is not discussed beyond noting position bias mitigation. No discussion of whether win rate measures actual capability improvement."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No scaffolding involved; this is a direct fine-tuning comparison."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether LLaMA/LLaMA2/Qwen2.5 training data includes solutions to test benchmarks."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the instruction tuning data or evaluation setup leaks information."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "Alpaca and WizardLM datasets were generated by LLMs; potential overlap with test sets is not discussed."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No leakage detection or prevention method is applied."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Selecting the top 1% of data by MIWV outperforms training on the full dataset.",
    375       "evidence": "Table 1 shows win rates > 1.0 for 1% subsets across LLaMA2-7B/13B on both Alpaca and WizardLM. Figure 2 shows similar results.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "MIWV outperforms 8 competing data selection methods.",
    380       "evidence": "Table 2 shows MIWV win rates of 1.119/1.211/1.178/1.234 at 1-15% ratios, higher than all competitors on the WizardLM test set.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "The method is universal and applicable to all LLMs.",
    385       "evidence": "Tested on LLaMA-7B, LLaMA2-7B/13B, Qwen2.5-7B/14B, and T5-11B (Appendix D). All show improvement.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "ICL-guided data selection is more effective than IFD Score alone.",
    390       "evidence": "Table 3 shows ICL+IFD Score wins more than original IFD Score across 5 test sets, and MIWV outperforms both.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Higher MIWV samples are more diverse and cover the instruction space more uniformly.",
    395       "evidence": "Figure 6 t-SNE visualization shows high-MIWV samples spread uniformly while low-MIWV samples cluster. Figure 7 shows higher quality scores on 5 of 6 dimensions.",
    396       "supported": "weak"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No variance or error bars despite repeated experiments",
    402       "detail": "The paper states experiments are repeated 3 times but reports only means. Given the known high variance in LLM fine-tuning, the absence of any spread measure makes it impossible to assess whether differences are meaningful."
    403     },
    404     {
    405       "flag": "No statistical significance testing",
    406       "detail": "All claims of outperformance are based on raw number comparisons (win rates, benchmark scores) without any significance tests. Many differences appear small."
    407     },
    408     {
    409       "flag": "GPT-4 as sole automated judge",
    410       "detail": "Primary evaluation metric is GPT-4 pairwise comparison, known to have systematic biases. No human evaluation validates these judgments."
    411     },
    412     {
    413       "flag": "Overclaiming universality",
    414       "detail": "Claims method is 'universal' and 'applicable to all LLMs' from testing on only 3 model families (LLaMA, Qwen, T5). No limitations section acknowledges this gap."
    415     },
    416     {
    417       "flag": "No limitations section",
    418       "detail": "Complete absence of limitations, threats to validity, or scope boundaries discussion."
    419     },
    420     {
    421       "flag": "Company affiliation without conflict acknowledgment",
    422       "detail": "6 of 9 authors are from Alibaba Cloud, which has commercial interest in efficient LLM training. No competing interests statement."
    423     },
    424     {
    425       "flag": "AlpacaEval budget limitation",
    426       "detail": "Appendix B.3 admits 'Due to budget constraints, evaluations are conducted on only 5% of the dataset' for AlpacaEval, but Table 1 presents these results without caveat."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Self-Instruct: Aligning Language Models with Self-Generated Instructions",
    432       "authors": ["Y. Wang", "Y. Kordi", "S. Mishra"],
    433       "year": 2023,
    434       "arxiv_id": "2212.10560",
    435       "relevance": "Foundational work on instruction data generation for LLM alignment."
    436     },
    437     {
    438       "title": "AlpaGasus: Training a Better Alpaca with Fewer Data",
    439       "authors": ["L. Chen", "S. Li", "J. Yan"],
    440       "year": 2024,
    441       "relevance": "Baseline method for instruction data selection using ChatGPT filtering."
    442     },
    443     {
    444       "title": "What Makes Good Data for Alignment? A Comprehensive Study of Automatic Data Selection in Instruction Tuning",
    445       "authors": ["W. Liu", "W. Zeng", "K. He"],
    446       "year": 2024,
    447       "relevance": "Deita method — baseline for instruction data quality and diversity selection."
    448     },
    449     {
    450       "title": "From Quantity to Quality: Boosting LLM Performance with Self-Guided Data Selection for Instruction Tuning",
    451       "authors": ["M. Li", "Y. Zhang", "Z. Li"],
    452       "year": 2024,
    453       "relevance": "IFD Score method for self-guided instruction data selection, key baseline."
    454     },
    455     {
    456       "title": "Superfiltering: Weak-to-Strong Data Filtering for Fast Instruction-Tuning",
    457       "authors": ["M. Li", "Y. Zhang", "S. He"],
    458       "year": 2024,
    459       "relevance": "Efficient data filtering baseline achieving fastest selection time."
    460     },
    461     {
    462       "title": "LESS: Selecting Influential Data for Targeted Instruction Tuning",
    463       "authors": ["M. Xia", "S. Malladi", "S. Gururangan"],
    464       "year": 2024,
    465       "relevance": "Data selection via gradient-based influence estimation (low-rank gradient similarity search) for targeted instruction tuning."
    466     },
    467     {
    468       "title": "LIMO: Less is More for Reasoning",
    469       "authors": ["Y. Ye", "Z. Huang", "Y. Xiao"],
    470       "year": 2025,
    471       "arxiv_id": "2502.03387",
    472       "relevance": "Recent work showing minimal high-quality data suffices for LLM reasoning."
    473     },
    474     {
    475       "title": "Recost: External knowledge guided data-efficient instruction tuning",
    476       "authors": ["Q. Zhang", "Y. Zhang", "H. Wang"],
    477       "year": 2024,
    478       "arxiv_id": "2402.17355",
    479       "relevance": "Baseline method using external knowledge for data-efficient instruction tuning."
    480     },
    481     {
    482       "title": "WizardLM: Empowering Large Language Models to Follow Complex Instructions",
    483       "authors": ["C. Xu", "Q. Sun", "K. Zheng"],
    484       "year": 2023,
    485       "arxiv_id": "2304.12244",
    486       "relevance": "Source dataset and baseline for instruction tuning evaluation."
    487     },
    488     {
    489       "title": "Lima: Less is more for alignment",
    490       "authors": ["C. Zhou", "P. Liu", "P. Xu"],
    491       "year": 2024,
    492       "relevance": "Influential work showing small curated datasets suffice for LLM alignment."
    493     }
    494   ]
    495 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs