scan.json (27043B)
1 { 2 "paper": { 3 "title": "CLASS-IT: Conversational and Lecture-Aligned Small-Scale Instruction Tuning for BabyLMs", 4 "authors": [ 5 "Luca Capone", 6 "Alessandro Bondielli", 7 "Alessandro Lenci" 8 ], 9 "year": 2025, 10 "venue": "arXiv", 11 "arxiv_id": "2510.25364" 12 }, 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No GitHub link or code repository URL is provided in the paper. The identifier 'colinglab/CLASS_IT' that is mentioned names the dataset rather than a code repository, and the paper states 'The full dataset will be released in the future following appropriate validation,' indicating that the dataset itself is not yet available." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "The paper mentions 'colinglab/CLASS_IT' as the dataset name but explicitly states 'The full dataset will be released in the future following appropriate validation.' Only a representative portion was used and no download link is provided. Pre-training data is from the BabyLM Challenge organizers (publicly available), but the authors' processed instruction tuning dataset is not released." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No requirements.txt, Dockerfile, or detailed environment setup is provided. The paper describes model architectures and training hyperparameters but does not specify library versions, framework versions, or dependency information." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are provided. While Tables 1 and 2 give architecture and training hyperparameters, there is no README, reproduction script, or detailed guide for replicating the experiments."
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper reports interquartile ranges (IQR) for z-score distributions across tasks (Figures 3, 5, 6), but does not report confidence intervals or error bars for the individual task results. The bar charts in Figures 2 and 4 show point estimates without any uncertainty measure." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper makes comparative claims (e.g., 'instruction tuning yields modest but measurable gains,' 'sequential curricula outperforming merged data') but does not use any statistical significance tests. Comparisons are based on visual inspection of bar charts and z-score distributions." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": false, 50 "justification": "No formal effect sizes (e.g., Cohen's d or percentage improvements over baselines) are reported. The paper describes results qualitatively ('small but consistent gains,' 'markedly worse') without quantifying the magnitude of differences." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper tests two model sizes (100M and 140M) with multiple instruction tuning configurations but does not justify why these sizes were chosen, nor is there any discussion of whether the number of configurations or evaluation tasks provides sufficient power for the paper's claims." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "IQR is reported for z-score distributions across tasks (Figures 3, 5, 6), but this measures spread across tasks, not variability across experimental runs. There is no indication that experiments were run multiple times with different seeds, and no standard deviation across runs is reported."
61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper compares against official BabyLM Challenge baselines: bl_gpt2-100M, bl_gptbertmixed-100M, and bl_simpo (Section 5, shown in blue in Figure 2). Pre-trained versions of the authors' own models also serve as baselines for measuring instruction tuning effects." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "The baselines are the official baselines from the BabyLM Challenge, which is the contemporary competition context for this work. These represent the current standard for this specific task and setting." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "The paper systematically compares: (1) pre-trained only vs. instruction-tuned, (2) merged vs. sequential curriculum, (3) different orderings of sequential instruction tuning (switch_wiki vs. wiki_switch), and (4) training on individual datasets (Switchboard only, Wikipedia only). This constitutes an ablation of the instruction tuning components and strategies." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper evaluates on multiple metrics across multiple benchmarks: GLUE and SuperGLUE tasks (fine-tuning accuracy on BoolQ, CB, COPA, MultiRC, RTE, WSC, QQP, MNLI), BLiMP, EWoK, WUGs, entity tracking (accuracy), and psycholinguistic correlation (change in R²). Z-scores are also computed as a global summary metric." 83 }, 84 "human_evaluation": { 85 "applies": false, 86 "answer": false, 87 "justification": "This paper evaluates small-scale language models on benchmark tasks. The claims are about benchmark performance and correlations with existing human psycholinguistic data, not about output quality that requires human judgment. Human evaluation is not clearly relevant to the claims made."
88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "Section 5 states 'Models are evaluated on the test set' for fine-tuning evaluation, with 'a randomly sampled 10k portion of the original training set for the task' used for fine-tuning. The zero-shot evaluation uses separate benchmark datasets. The pre-training data was split 90/10 into training and validation sets (Section 4)." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Figures 2 and 4 provide per-task breakdowns for all SuperGLUE tasks and zero-shot tasks respectively. The paper discusses performance differences across individual tasks (e.g., QQP, MNLI, WUGs, entity tracking, R² tasks) rather than reporting only aggregate numbers." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "The paper discusses where their models fail: 100M models are 'markedly worse' on QQP and MNLI (Section 5), all models struggle on entity tracking (Section 5), instruction tuning does not consistently help on zero-shot tasks, and the Switchboard-only model performs worst. The Discussion section (Section 6) analyzes why instruction tuning may hurt zero-shot performance." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper reports several negative results: instruction tuning does not consistently transfer to zero-shot tasks, merged instruction tuning is worse than sequential, Switchboard-only instruction tuning performs worst, and instruction-tuned models that have seen more data do not consistently outperform pre-trained ones (Section 6)." 
108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims 'small but consistent gains in fine-tuning scenarios, with sequential curricula outperforming merged data' and that 'improvements do not consistently transfer to zero-shot tasks.' These are supported by the results in Figures 2-6 and the discussion in Sections 5-6. The claims are hedged appropriately ('small but consistent,' 'do not consistently transfer')." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper makes causal claims through its ablation design (instruction tuning causes improvements in fine-tuning). These are supported by controlled single-variable manipulations: comparing pre-trained vs. instruction-tuned variants, merged vs. sequential, and different orderings, while holding other variables constant. The experimental design is adequate for these causal claims." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper consistently bounds its claims to 'BabyLM-scale models,' 'small-scale LMs,' and 'ecological training limits.' The abstract and conclusion explicitly note results are about models with 100M-140M parameters trained on ~100M words. The discussion acknowledges that 'larger models have been shown to not suffer from similar issues' (Section 6), properly bounding generalization." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "Section 6 discusses alternative explanations: the instruction tuning dataset imbalance between conversational and QA data may bias results; the evaluation criteria (classification tasks, log-likelihood) may undermine instruction-tuned models; the Switchboard corpus may be too limited in register. 
Section 7 also discusses how smaller models' better psycholinguistic correlation may reflect a known phenomenon rather than their specific approach." 130 } 131 }, 132 "setup_transparency": { 133 "model_versions_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper specifies 'LLaMA-3.2-3B-Instruct' for data augmentation (Section 3.2) but does not provide a snapshot date or specific version identifier. For their own models, architecture details are given in Table 1 but these are custom-trained models, not versioned API models. The LLaMA reference lacks a precise version identifier." 137 }, 138 "prompts_provided": { 139 "applies": true, 140 "answer": true, 141 "justification": "The exact prompt used for generating the Simple Wikipedia augmentation data is provided in Section 3.2: 'Based on the following text, generate 3 questions and detailed, informative answers...' This is the only prompt used in the study (for data augmentation with LLaMA-3.2-3B-Instruct)." 142 }, 143 "hyperparameters_reported": { 144 "applies": true, 145 "answer": true, 146 "justification": "Tables 1 and 2 provide detailed hyperparameters: vocab size, max length, hidden size, attention heads, layers, trainable parameters (Table 1), and learning rate, batch size, epochs, LR scheduler, warm-up steps for both pre-training and instruction tuning (Table 2). Section 4 also states 'We used the same token-level cross-entropy loss' and that loss is computed only on target tokens during instruction tuning." 147 }, 148 "scaffolding_described": { 149 "applies": false, 150 "answer": false, 151 "justification": "No agentic scaffolding is used. This is a standard pre-training and instruction tuning pipeline with no tool use, retry logic, or agent-based architecture." 
152 }, 153 "data_preprocessing_documented": { 154 "applies": true, 155 "answer": true, 156 "justification": "Section 3.1 describes pre-training data preprocessing: special character removal, discarding entries with two words or fewer, concatenation of same-speaker Switchboard utterances. Section 3.2 describes instruction tuning data processing: merging consecutive same-speaker utterances, sliding window segmentation scheme for Switchboard, and the LLaMA augmentation process for Simple Wikipedia. Word counts are provided at each stage." 157 } 158 }, 159 "limitations_and_scope": { 160 "limitations_section_present": { 161 "applies": true, 162 "answer": true, 163 "justification": "A dedicated 'Limitations' section is present after the conclusion (Section 7), discussing constraints of instruction tuning dataset size, data balance issues, and partial validation of the augmentation process." 164 }, 165 "threats_to_validity_specific": { 166 "applies": true, 167 "answer": true, 168 "justification": "The Limitations section identifies specific threats: (1) the instruction tuning dataset is small relative to pre-training, which may reduce impact; (2) conversational data is underrepresented relative to QA data, potentially biasing results; (3) the Simple Wikipedia augmentation was only partially validated. Section 7 also discusses the evaluation methodology limitation specific to their setting." 169 }, 170 "scope_boundaries_stated": { 171 "applies": true, 172 "answer": true, 173 "justification": "The paper explicitly states scope boundaries: results apply to BabyLM-scale models (100M-140M parameters), ecological training data limits (~100M words), decoder-only architectures, and the specific evaluation tasks from the BabyLM Challenge. Section 6 notes 'larger models have been shown to not suffer from similar issues' and Section 7 acknowledges the evaluation criteria may undermine their models' true capabilities." 
174 } 175 }, 176 "data_integrity": { 177 "raw_data_available": { 178 "applies": true, 179 "answer": false, 180 "justification": "The pre-training data comes from the BabyLM Challenge (publicly available from organizers), but the processed instruction tuning dataset (colinglab/CLASS_IT) is stated as not yet released: 'The full dataset will be released in the future following appropriate validation.' Raw experimental results (individual task scores) are shown in figures but not provided as downloadable data." 181 }, 182 "data_collection_described": { 183 "applies": true, 184 "answer": true, 185 "justification": "Section 3 describes data collection in detail: pre-training data comes from the BabyLM Challenge organizers (~100M words from CHILDES, Gutenberg, BNC, OpenSubtitles, Switchboard, Simple Wikipedia). The instruction tuning data construction process is described for both Switchboard (sliding window approach, 38,802 items, ~1.3M words) and augmented Simple Wikipedia (97,697 items, 18M words)." 186 }, 187 "recruitment_methods_described": { 188 "applies": false, 189 "answer": false, 190 "justification": "No human participants were recruited. The paper uses existing corpora and benchmark datasets for model training and evaluation." 191 }, 192 "data_pipeline_documented": { 193 "applies": true, 194 "answer": true, 195 "justification": "The data pipeline is documented in Sections 3.1 and 3.2: raw data from organizers → preprocessing (special character removal, short entry filtering, speaker concatenation) → 91M words pre-training corpus. For instruction tuning: Switchboard → sliding window segmentation → 38,802 items; Simple Wikipedia → LLaMA augmentation → 97,697 items → subset selection to stay within 100M word limit. Word counts are given at each stage." 
196 } 197 }, 198 "conflicts_of_interest": { 199 "funding_disclosed": { 200 "applies": true, 201 "answer": true, 202 "justification": "The Acknowledgments section discloses funding: PRIN 2022 Project (CUP I53D23004050006) from the Italian Ministry of University and Research (MUR), and PNRR FAIR project funded by the European Commission under NextGeneration EU." 203 }, 204 "affiliations_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "Author affiliations are clearly listed: Luca Capone and Alessandro Lenci at CoLing Lab, University of Pisa (Dept. of Philology, Literature and Linguistics); Alessandro Bondielli at both the CoLing Lab and Dept. of Computer Science, University of Pisa." 208 }, 209 "funder_independent_of_outcome": { 210 "applies": true, 211 "answer": true, 212 "justification": "The funders are the Italian Ministry of University and Research and the European Commission (NextGeneration EU). These are government research funding bodies with no financial stake in the outcome of BabyLM experiments." 213 }, 214 "financial_interests_declared": { 215 "applies": true, 216 "answer": false, 217 "justification": "No competing interests or financial interests statement is present in the paper. While this is an academic paper from a university lab with government funding (low conflict risk), the absence of an explicit declaration means this criterion is not satisfied." 218 } 219 }, 220 "contamination": { 221 "training_cutoff_stated": { 222 "applies": true, 223 "answer": false, 224 "justification": "The paper uses LLaMA-3.2-3B-Instruct for data augmentation but does not state its training data cutoff date. For their own trained BabyLM models, the training data is specified but no formal cutoff is discussed relative to the evaluation benchmarks." 225 }, 226 "train_test_overlap_discussed": { 227 "applies": true, 228 "answer": false, 229 "justification": "No discussion of potential overlap between pre-training data and evaluation benchmarks. 
The pre-training corpus includes Gutenberg, BNC, OpenSubtitles, and CHILDES; some evaluation benchmarks (BLiMP, SuperGLUE) could have overlapping content, but this is not addressed." 230 }, 231 "benchmark_contamination_addressed": { 232 "applies": true, 233 "answer": false, 234 "justification": "The evaluation benchmarks (BLiMP published 2020, SuperGLUE, EWoK) were available before the model's training data was assembled, but the paper does not discuss whether any benchmark examples could appear in the pre-training corpora. Additionally, LLaMA-3.2-3B-Instruct was used for data augmentation, and its potential contamination with benchmark data is not addressed." 235 } 236 }, 237 "human_studies": { 238 "pre_registered": { 239 "applies": false, 240 "answer": false, 241 "justification": "No human participants in this study. It is a benchmark evaluation of language models." 242 }, 243 "irb_or_ethics_approval": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "demographics_reported": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "inclusion_exclusion_criteria": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "randomization_described": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "blinding_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "attrition_reported": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 
272 } 273 }, 274 "cost_and_practicality": { 275 "inference_cost_reported": { 276 "applies": true, 277 "answer": false, 278 "justification": "No inference cost, latency, or tokens consumed are reported for either the data augmentation with LLaMA-3.2-3B-Instruct or the evaluation of their models." 279 }, 280 "compute_budget_stated": { 281 "applies": true, 282 "answer": false, 283 "justification": "No GPU hours, hardware specifications, training time, or total compute budget is reported. The paper states word counts processed (908M words total) but does not specify the computational resources used." 284 } 285 } 286 }, 287 "claims": [ 288 { 289 "claim": "Instruction tuning yields small but consistent gains in fine-tuning scenarios for BabyLM-scale models.", 290 "evidence": "Figure 2 shows instruction-tuned models are generally competitive with or slightly above pre-trained baselines on SuperGLUE tasks. Figure 3 z-score distributions show instruction-tuned variants generally have comparable or higher medians than pre-trained models. Section 5 states 'in all cases there is at least an instruction-tuned model better than the pre-trained one.'", 291 "supported": "moderate" 292 }, 293 { 294 "claim": "Sequential curricula (presenting conversational and QA data separately) outperform merged data for instruction tuning.", 295 "evidence": "Figure 3 and Section 5 show that sequentially-trained models 'have a very similar median score, but a much smaller IQR, and are the only two models with all z-scores above zero.' Figure 6 confirms 'tuning the model sequentially on different datasets is consistently better than doing so on a mixture of the datasets.'", 296 "supported": "moderate" 297 }, 298 { 299 "claim": "Instruction tuning improvements do not consistently transfer to zero-shot tasks.", 300 "evidence": "Figure 4 and Section 5 report 'we do not observe striking differences between pre-trained and instruction-tuned models' in zero-shot evaluation. 
Figure 5 z-score distribution shows 'no clear trend in favour or against instruction tuning.' Section 6 discusses this as a potential trade-off between interaction-focused adaptation and broad linguistic generalization.", 301 "supported": "moderate" 302 }, 303 { 304 "claim": "Smaller (100M) models correlate better with human psycholinguistic data than larger (140M) models.", 305 "evidence": "Section 5 reports '100 million model variants are vastly superior to both the 140 million models and the baselines' on R²-based psycholinguistic correlation tasks (Figure 4 right panel). Section 6 notes this is 'in line with previous literature where smaller models often correlate better with human psychometric data.'", 306 "supported": "moderate" 307 } 308 ], 309 "methodology_tags": [ 310 "benchmark-eval" 311 ], 312 "key_findings": "Instruction tuning of BabyLM-scale models (100M-140M parameters) provides small but measurable gains on fine-tuning tasks (SuperGLUE), particularly when conversational and QA datasets are presented sequentially rather than merged. However, these improvements do not consistently transfer to zero-shot tasks (BLiMP, EWoK, WUGs, entity tracking, psycholinguistic correlation), suggesting instruction tuning may narrow model capabilities at this scale. Smaller 100M-parameter models showed stronger correlations with human psycholinguistic data than 140M models, consistent with prior work on scaling and cognitive modeling.", 313 "red_flags": [ 314 { 315 "flag": "No statistical significance tests", 316 "detail": "All comparative claims (instruction tuning helps, sequential beats merged, etc.) are based on visual inspection of bar charts and z-score distributions without any significance testing. Given the small and inconsistent differences reported, it is unclear whether observed gains are reliable or within noise." 317 }, 318 { 319 "flag": "No multi-run variance reporting", 320 "detail": "Results appear to be from single training runs. 
No standard deviations across random seeds are reported. For small models where training is cheap, the absence of multi-seed experiments makes it impossible to assess result stability." 321 }, 322 { 323 "flag": "No compute or cost reporting", 324 "detail": "Despite training multiple model configurations across pre-training and instruction tuning stages, no GPU hours, hardware, or training time is reported, making it difficult to assess practical reproducibility." 325 }, 326 { 327 "flag": "Augmented data quality unvalidated", 328 "detail": "The Simple Wikipedia augmentation using LLaMA-3.2-3B-Instruct was 'only partially validated' per the Limitations section. The quality of this synthetic data directly affects the instruction tuning results but is not systematically evaluated." 329 } 330 ], 331 "cited_papers": [ 332 { 333 "title": "Emergent abilities of large language models", 334 "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"], 335 "year": 2022, 336 "arxiv_id": "2206.07682", 337 "relevance": "Foundational paper on emergent abilities in LLMs, relevant to understanding scaling effects and capability emergence in AI models." 338 }, 339 { 340 "title": "Are emergent abilities of large language models a mirage?", 341 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 342 "year": 2023, 343 "relevance": "Challenges the emergent abilities narrative, arguing it may be a measurement artifact, relevant to methodology of evaluating LLM capabilities." 344 }, 345 { 346 "title": "The Llama 3 herd of models", 347 "authors": ["Abhimanyu Dubey"], 348 "year": 2024, 349 "relevance": "Describes the LLaMA model family used for data augmentation, relevant to understanding LLM architectures and their applications." 
350 }, 351 { 352 "title": "Baby's CoThought: Leveraging large language models for enhanced reasoning in compact models", 353 "authors": ["Zheyu Zhang", "Han Yang", "Bolei Ma"], 354 "year": 2023, 355 "relevance": "Uses LLMs to create educational datasets for training smaller models, directly relevant to LLM-assisted training and knowledge distillation." 356 }, 357 { 358 "title": "BabyStories: Can reinforcement learning teach baby language models to write better stories?", 359 "authors": ["Xingmeng Zhao", "Tongnian Wang", "Sheri Osborn", "Anthony Rios"], 360 "year": 2023, 361 "relevance": "Explores reinforcement learning for training small language models, relevant to alternative training paradigms for LMs." 362 }, 363 { 364 "title": "Elements of World Knowledge (EWoK): A cognition-inspired framework for evaluating basic world knowledge in language models", 365 "authors": ["Anna A. Ivanova"], 366 "year": 2024, 367 "relevance": "Evaluation framework for assessing world knowledge in LMs, relevant to LLM evaluation methodology." 368 }, 369 { 370 "title": "Curriculum learning: A survey", 371 "authors": ["Petru Soviany", "Radu Tudor Ionescu", "Paolo Rota", "Nicu Sebe"], 372 "year": 2022, 373 "relevance": "Comprehensive survey of curriculum learning approaches, relevant to training methodology for AI models." 374 }, 375 { 376 "title": "Bridging the data gap between children and large language models", 377 "authors": ["Michael C. Frank"], 378 "year": 2023, 379 "relevance": "Discusses the data efficiency gap between human and machine learning, relevant to understanding LLM data requirements." 380 }, 381 { 382 "title": "Explica: Evaluating explicit causal reasoning in large language models", 383 "authors": ["Martina Miliani", "Serena Auriemma", "Alessandro Bondielli"], 384 "year": 2025, 385 "relevance": "Evaluates causal reasoning in LLMs, relevant to LLM capability assessment methodology." 386 } 387 ] 388 }