scan-v5.json (26284B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "LaCy: What Small Language Models Can and Should Learn is Not Just a Question of Loss", 6 "authors": [ 7 "Szilvia Ujváry", 8 "Louis Béthune", 9 "Pierre Ablin", 10 "João Monteiro", 11 "Marco Cuturi", 12 "Michael Kirchhof" 13 ], 14 "year": 2026, 15 "venue": "arXiv", 16 "arxiv_id": "2602.12005", 17 "doi": null 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "All abstract claims (loss is predictive but insufficient, spaCy parser helps, LaCy achieves higher FactScores, outperforms Rho/LLM-judge) are substantiated by experiments in Sections 3, 5.2, and 5.3.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": true, 30 "justification": "Causal claims about LaCy improving FactScore are supported by controlled pretraining experiments with multiple baselines reimplemented on identical budgets and data, plus ablations in Section 5.4.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": false, 36 "justification": "Experiments are confined to a single Wikipedia domain (dwiki) with GPT-2 architectures up to 1.3B, yet the paper title and framing suggest broad applicability to SLMs generally; the authors acknowledge in the conclusion that 'this study is an explorative pilot study.'", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper does not discuss alternative explanations for why LaCy improves FactScore—e.g., whether gains stem from call calibration, training token equalization artifacts, or the specific cascade partner rather than the token selection itself.", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": true, 48 "justification": "The paper clearly distinguishes between FactScore (factual precision in biography generation), fact leakage (parametric memorization measured via QA benchmarks with calling disabled), and NLU performance—each measuring a distinct aspect.", 49 "source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": false, 56 "justification": "There is no dedicated limitations section; the only limitation discussion is two sentences in the conclusion noting it is a 'pilot study' and that behavior 'is mostly because it was trained at a small scale.'", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": false, 62 "justification": "No specific threats to validity are enumerated; the paper does not discuss confounds such as domain specificity, the impact of cascade model quality, or the small acceptability experiment (112 documents).", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper explicitly states it addresses only 'when to call,' not 'what to do once an SLM calls for help,' and Section 2.3 devotes a subsection to explaining why the latter is out of scope.", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": false, 76 "justification": "No funding statement or grant acknowledgment appears anywhere in the paper; the acknowledgments only name individuals for helpful conversations.", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "Author affiliations are clearly disclosed on the title page: Apple and University of Cambridge (intern at Apple).", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": true, 87 "answer": false, 88 "justification": "The majority of authors are Apple employees and the work was conducted at Apple; the results (more efficient SLMs using external calls) directly align with Apple's commercial interest in on-device AI.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "No competing interests statement, patent disclosures, or financial interest declarations appear in the paper.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": true, 102 "justification": "Key terms are defined: SLMs (around or below 1B parameters), acceptability (factual/logical consistency with ground truth, not exact match), FactScore, model cascade, and <CALL> token mechanism are all precisely defined in context.", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper explicitly contributes LaCy, a pretraining method that combines spaCy factuality annotations with loss signals to determine <CALL> token placement, and frames it as improving over purely loss-based or LLM-judge approaches.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section 2 provides detailed engagement with Rho-1/Rho-loss, LLM judge methods, IDK token approaches, and cascade literature, explicitly explaining how LaCy refines each prior approach rather than just listing citations.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "empirical": { 121 "artifacts": { 122 "code_released": { 123 "applies": true, 124 "answer": false, 125 "justification": "No code repository URL is provided; the paper mentions relying on a codebase 'kickstarted' by Awni Hannun but does not release it.", 126 "source": "haiku" 127 }, 128 "data_released": { 129 "applies": true, 130 "answer": true, 131 "justification": "The dwiki dataset is from the publicly available OLMo2 project; LLM judge annotations are cited as available at kilian-group/LMLM-pretrain-dwiki6.1M; evaluation benchmarks (BigBench, PopQA, FactScore entities) are standard public datasets.", 132 "source": "haiku" 133 }, 134 "environment_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "Training hardware (8 A100-80GB GPUs), precision (bfloat16/float32), and framework choices are described, but no requirements file, Dockerfile, or complete software environment specification is provided.", 138 "source": "haiku" 139 }, 140 "reproduction_instructions": { 141 "applies": true, 142 "answer": false, 143 "justification": "No step-by-step reproduction instructions are provided; the appendix describes the methods but not a reproducible pipeline someone could follow without guessing implementation details.", 144 "source": "haiku" 145 } 146 }, 147 "statistical_methodology": { 148 "confidence_intervals_or_error_bars": { 149 "applies": true, 150 "answer": false, 151 "justification": "All main results (Figures 2, 5, 6) are reported as single point estimates with no error bars, confidence intervals, or standard deviations across runs.", 152 "source": "haiku" 153 }, 154 "significance_tests": { 155 "applies": true, 156 "answer": false, 157 "justification": "No statistical significance tests are applied to any comparative claims; differences between methods are presented without testing whether they exceed chance variation.", 158 "source": "haiku" 159 }, 160 "effect_sizes_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Absolute improvements are reported—e.g., LaCy achieves 22.71% vs. baseline 15.89% FactScore (+6.82pp), and 11.28% vs. 18.45% fact leakage—providing interpretable effect magnitudes relative to baseline.", 164 "source": "haiku" 165 }, 166 "sample_size_justified": { 167 "applies": true, 168 "answer": false, 169 "justification": "The acceptability experiment uses 112 documents (one validation batch) without justification; the 183 FactScore biography entities are inherited from the benchmark without power analysis.", 170 "source": "haiku" 171 }, 172 "variance_reported": { 173 "applies": true, 174 "answer": false, 175 "justification": "No standard deviation, variance, or results across multiple training runs are reported; each method appears to have been trained once.", 176 "source": "haiku" 177 } 178 }, 179 "evaluation_design": { 180 "baselines_included": { 181 "applies": true, 182 "answer": true, 183 "justification": "Four baselines are included and reimplemented with identical budgets: no-<CALL> baseline, loss-based calls, LLM judge (Zhao et al. 2025), and Rho-1 (Lin et al. 2024).", 184 "source": "haiku" 185 }, 186 "baselines_contemporary": { 187 "applies": true, 188 "answer": true, 189 "justification": "All baselines are recent (2022–2025) and directly comparable; they are reimplemented from scratch rather than copied from prior papers, ensuring fair comparison.", 190 "source": "haiku" 191 }, 192 "ablation_study": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 5.4 ablates four LaCy variants: spaCy only, spaCy+Reference Model, LaCy+Ignorefacts, and LaCy+Ignore, systematically isolating contributions of the loss and factuality components.", 196 "source": "haiku" 197 }, 198 "multiple_metrics": { 199 "applies": true, 200 "answer": true, 201 "justification": "Evaluation uses FactScore (factual precision), fact leakage (QA containment), NLU benchmarks (ARC-Easy, HellaSwag, PIQA, SIQA), and validation loss breakdowns (call/non-call/total).", 202 "source": "haiku" 203 }, 204 "human_evaluation": { 205 "applies": false, 206 "answer": false, 207 "justification": "No human evaluation is conducted; acceptability is judged by Gemini 2.0 Flash acting as an LLM judge, not human raters.", 208 "source": "haiku" 209 }, 210 "held_out_test_set": { 211 "applies": true, 212 "answer": true, 213 "justification": "10% of dwiki is reserved as validation set (Section A.4.3); FactScore is evaluated on 183 biography entities from the held-out benchmark.", 214 "source": "haiku" 215 }, 216 "per_category_breakdown": { 217 "applies": true, 218 "answer": true, 219 "justification": "Results are broken down by model scale (334M vs. 1.3B), by metric type (FactScore vs. fact leakage vs. NLU), and by individual NLU benchmark in Tables 1 and 6.", 220 "source": "haiku" 221 }, 222 "failure_cases_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Section 5.2 notes 'the model sometimes tries to predict factual tokens it should not' and Figure 4 explicitly shows examples of non-factual retrievals (e.g., retrieved token 'the' in wrong context).", 226 "source": "haiku" 227 }, 228 "negative_results_reported": { 229 "applies": true, 230 "answer": true, 231 "justification": "Section 5.7 reports that validation loss is uncorrelated with FactScore, and Section C.2/Figure 10 shows LaCy+Ignore is not beneficial once training steps are equalized—both reported as negative findings.", 232 "source": "haiku" 233 } 234 }, 235 "setup_transparency": { 236 "model_versions_specified": { 237 "applies": true, 238 "answer": true, 239 "justification": "Exact model versions are specified: Llama 3.2 1B (Meta AI, 2024), Qwen 3 32B (Qwen Team, 2025), Gemini 2.0 Flash; GPT-2 architectures described by parameter count, dimension, heads, and layers in Table 3.", 240 "source": "haiku" 241 }, 242 "prompts_provided": { 243 "applies": true, 244 "answer": true, 245 "justification": "The full acceptability judging prompt with examples is provided in Appendix A.1; the RAG prompt for Qwen 3 32B is provided in Appendix A.5; the FactScore biography prompt is described in Appendix B.", 246 "source": "haiku" 247 }, 248 "hyperparameters_reported": { 249 "applies": true, 250 "answer": true, 251 "justification": "Table 4 reports batch size, total training steps, learning rate, warmup steps, and precision for all model variants; AdamW with weight decay 0.1 and no LR scheduling beyond warmup is noted.", 252 "source": "haiku" 253 }, 254 "scaffolding_described": { 255 "applies": true, 256 "answer": true, 257 "justification": "The cascade inference mechanism is described in detail in Appendix A.5, including tokenizer mismatch handling, the running-quantile <CALL> logit calibration, and the RAG cascade setup.", 258 "source": "haiku" 259 }, 260 "data_preprocessing_documented": { 261 "applies": true, 262 "answer": true, 263 "justification": "Appendix A.2.1 provides a step-by-step description of the spaCy annotation pipeline including named entity processing, supplementary entity detection, grammatical word classification, and tokenization mapping.", 264 "source": "haiku" 265 } 266 }, 267 "data_integrity": { 268 "raw_data_available": { 269 "applies": true, 270 "answer": true, 271 "justification": "dwiki is from OLMo2 (public); LLM judge annotations are cited as available at kilian-group/LMLM-pretrain-dwiki6.1M; evaluation benchmarks are standard public resources.", 272 "source": "haiku" 273 }, 274 "data_collection_described": { 275 "applies": true, 276 "answer": true, 277 "justification": "Section A.2 describes data sourcing (dwiki from OLMo2, 3B Wikipedia tokens), LLM judge annotation processing, and the spaCy annotation pipeline in sufficient detail.", 278 "source": "haiku" 279 }, 280 "recruitment_methods_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants; all data is from public datasets and existing model annotations.", 284 "source": "haiku" 285 }, 286 "data_pipeline_documented": { 287 "applies": true, 288 "answer": true, 289 "justification": "Appendix A.2.1–A.2.2 documents the full pipeline from raw Wikipedia tokens through spaCy annotation, tokenization, and conversion of LLM judge labels to the <CALL> format used in training.", 290 "source": "haiku" 291 } 292 }, 293 "contamination": { 294 "training_cutoff_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "Neither the training data cutoff of Llama 3.2 1B (used as cascade partner) nor of Gemini 2.0 Flash (used as acceptability judge) is stated or discussed.", 298 "source": "haiku" 299 }, 300 "train_test_overlap_discussed": { 301 "applies": true, 302 "answer": false, 303 "justification": "The model is pretrained on Wikipedia (dwiki) and evaluated on FactScore (Wikipedia biography entities); potential overlap between training documents and evaluation entities is not acknowledged or addressed.", 304 "source": "haiku" 305 }, 306 "benchmark_contamination_addressed": { 307 "applies": true, 308 "answer": false, 309 "justification": "The cascade partner Llama 3.2 1B was almost certainly pretrained on Wikipedia data that overlaps with FactScore entities; this contamination of the cascade retrieval is not discussed.", 310 "source": "haiku" 311 } 312 }, 313 "human_studies": { 314 "pre_registered": { 315 "applies": false, 316 "answer": false, 317 "justification": "No human participants.", 318 "source": "haiku" 319 }, 320 "irb_or_ethics_approval": { 321 "applies": false, 322 "answer": false, 323 "justification": "No human participants.", 324 "source": "haiku" 325 }, 326 "demographics_reported": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participants.", 330 "source": "haiku" 331 }, 332 "inclusion_exclusion_criteria": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participants.", 336 "source": "haiku" 337 }, 338 "randomization_described": { 339 "applies": false, 340 "answer": false, 341 "justification": "No human participants.", 342 "source": "haiku" 343 }, 344 "blinding_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human participants.", 348 "source": "haiku" 349 }, 350 "attrition_reported": { 351 "applies": false, 352 "answer": false, 353 "justification": "No human participants.", 354 "source": "haiku" 355 } 356 }, 357 "cost_and_practicality": { 358 "inference_cost_reported": { 359 "applies": true, 360 "answer": true, 361 "justification": "Table 2 reports preprocessing overhead per 1B tokens (e.g., LaCy: 152h on single CPU core vs. LLM judge: 233h on A100 GPU); cascade inference cost is qualitatively discussed in terms of call rate (~22%).", 362 "source": "haiku" 363 }, 364 "compute_budget_stated": { 365 "applies": true, 366 "answer": true, 367 "justification": "Section A.4.1 states training completes in 3 days on 8 A100-80GB GPUs; Table 4 provides exact training steps for all model variants enabling compute estimation.", 368 "source": "haiku" 369 } 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "Cross-entropy loss is blind to token type: non-factual tokens with high loss are often acceptable continuations and should not trigger a <CALL>.", 376 "evidence": "Figure 3 shows that factual tokens have lower acceptability at high loss quantiles while non-factual high-loss tokens remain largely acceptable, validated via Gemini 2.0 Flash acceptability scoring on 112 documents.", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "LaCy outperforms all baseline methods on FactScore for biography generation (22.71% vs. 21.63% for Rho-1).", 381 "evidence": "Figure 2 (left) shows LaCy at 22.71% vs. Rho-1 at 21.63%, LLM judge at 20.97%, loss-based at 19.44%, baseline at 15.89% for 334M model with Llama 3.2 1B cascade.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "LaCy achieves the lowest fact leakage (11.28%) compared to all baselines, confirming minimal parametric memorization of factual tokens.", 386 "evidence": "Figure 2 (right) shows LaCy fact leakage of 11.28% vs. LLM judge 14.25%, Rho-1 15.91%, loss-based 19.55%, baseline 18.45% on BigBench QA Wikidata and PopQA.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Validation loss is not correlated with FactScore in the token-selection setting, unlike standard pretraining.", 391 "evidence": "Figure 7 shows no monotonic relationship between call loss, non-call loss, or total loss and FactScore across the five compared methods, contradicting scaling law predictions.", 392 "supported": "strong" 393 }, 394 { 395 "claim": "LaCy's spaCy-based preprocessing costs only CPU compute (152h/1B tokens on a single CPU core) versus GPU compute for LLM judge (233h on A100).", 396 "evidence": "Table 2 directly compares preprocessing overhead per 1B tokens for all methods; LaCy is the only GPU-free labeling approach other than the trivial loss-based baseline.", 397 "supported": "strong" 398 }, 399 { 400 "claim": "Factual offloading does not significantly degrade NLU performance; LaCy matches or slightly exceeds the baseline on NLU benchmarks.", 401 "evidence": "Table 1 shows LaCy at 39.9% average NLU vs. baseline 39.6%, and LLM judge at 39.1%, with differences within ~1pp across ARC-Easy, HellaSwag, PIQA, SIQA.", 402 "supported": "moderate" 403 } 404 ], 405 "methodology_tags": [ 406 "benchmark-eval", 407 "theoretical" 408 ], 409 "key_findings": "LaCy is a pretraining method for Small Language Models that combines spaCy grammar-based factuality detection with the model's own loss signal to identify which tokens should be delegated via a <CALL> token to a cascade model rather than learned. The core insight is that loss alone cannot distinguish factual errors from acceptable alternative continuations: high-loss non-factual tokens are often semantically valid, while high-loss factual tokens are genuinely unlearnable and dangerous. LaCy achieves higher FactScores than loss-based, Rho-1, and LLM-judge baselines while simultaneously minimizing parametric memorization of facts, and it does so with CPU-only preprocessing overhead. Crucially, the paper demonstrates that validation loss is decoupled from factual accuracy in token-selection settings, undermining the standard practice of using loss as a proxy for pretraining quality.", 410 "red_flags": [ 411 { 412 "flag": "Single domain, single dataset", 413 "detail": "All experiments use only the Wikipedia dwiki corpus; generalization to other domains is entirely untested, yet claims are framed broadly for SLMs in general." 414 }, 415 { 416 "flag": "No confidence intervals or repeated runs", 417 "detail": "All results are single point estimates from one training run per method; without variance across seeds or runs, observed differences (e.g., 22.71% vs. 21.63% FactScore) cannot be assessed for reliability." 418 }, 419 { 420 "flag": "Train/test contamination unaddressed", 421 "detail": "Models are pretrained on Wikipedia (dwiki) and evaluated on FactScore (Wikipedia biography entities); the cascade partner Llama 3.2 1B also saw Wikipedia data during pretraining—none of this overlap is acknowledged." 422 }, 423 { 424 "flag": "Acceptability experiment extremely small", 425 "detail": "The key motivating experiment (Section 3, Figure 3) uses only 112 documents (one validation batch) scored by an LLM judge, with no quantitative uncertainty reported." 426 }, 427 { 428 "flag": "Self-described as pilot study", 429 "detail": "Authors explicitly call this 'an explorative pilot study' in the conclusion, but the framing throughout the paper and abstract presents findings as general contributions without appropriate hedging." 430 }, 431 { 432 "flag": "No code release", 433 "detail": "Despite claiming reproducibility benefits and describing the pipeline in detail, no code is released, making independent replication difficult." 434 }, 435 { 436 "flag": "Undisclosed funding from interested party", 437 "detail": "Work performed at Apple with no funding disclosure; Apple has direct commercial interest in efficient SLMs for on-device use." 438 } 439 ], 440 "cited_papers": [ 441 { 442 "title": "Not all tokens are what you need for pretraining (Rho-1)", 443 "relevance": "Direct baseline and prior work on token selection for pretraining efficiency" 444 }, 445 { 446 "title": "FActScore: Fine-grained atomic evaluation of factual precision in long form text generation", 447 "relevance": "Primary evaluation metric used throughout the paper" 448 }, 449 { 450 "title": "Pre-training large memory language models with internal and external knowledge (LMLM/LLM judge)", 451 "relevance": "Key baseline providing LLM judge annotations and the dwiki dataset" 452 }, 453 { 454 "title": "Language model cascades: Token-level uncertainty and beyond", 455 "relevance": "Foundation for the cascade inference framework used in experiments" 456 }, 457 { 458 "title": "Prioritized training on points that are learnable, worth learning, and not yet learnt (Rho-loss)", 459 "relevance": "Conceptual predecessor to loss-based token selection methods" 460 }, 461 { 462 "title": "Physics of language models: part 3.1, knowledge storage and extraction", 463 "relevance": "Motivates the claim that LLM storage is limited by parameter count" 464 }, 465 { 466 "title": "Scaling laws for neural language models", 467 "relevance": "Establishes the loss-performance correlation that LaCy shows breaks down in token-selection settings" 468 }, 469 { 470 "title": "Cascade-aware training of language models", 471 "relevance": "Prior work on training models aware of cascade inference, used as loss-based calls baseline" 472 } 473 ], 474 "engagement_factors": { 475 "practical_relevance": { 476 "score": 2, 477 "justification": "SLM deployment with function calling is practically relevant to on-device AI, but the method requires full pretraining from scratch, limiting immediate adoption." 478 }, 479 "surprise_contrarian": { 480 "score": 2, 481 "justification": "The finding that validation loss is decoupled from factual accuracy in token-selection settings directly contradicts the scaling law consensus that guides most pretraining decisions." 482 }, 483 "fear_safety": { 484 "score": 0, 485 "justification": "No AI safety or risk concerns raised; the paper is narrowly focused on factual accuracy in biography generation." 486 }, 487 "drama_conflict": { 488 "score": 1, 489 "justification": "Challenges the 'train on all tokens' orthodoxy of foundation model training but does so without confrontational framing." 490 }, 491 "demo_ability": { 492 "score": 1, 493 "justification": "Biography generation examples are shown in Figure 4 but no interactive demo or released model is available for practitioners to try." 494 }, 495 "brand_recognition": { 496 "score": 2, 497 "justification": "Apple affiliation and use of Llama, Gemini, and Qwen models lend brand recognition, though the paper itself trains small custom GPT-2 architectures." 498 } 499 }, 500 "hn_data": { 501 "threads": [], 502 "top_points": 0, 503 "total_points": 0, 504 "total_comments": 0 505 } 506 }