scan-v5.json (26995B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Evaluating Large Language Models for Generalization and Robustness via Data Compression", 6 "authors": [ 7 "Yucheng Li", 8 "Yunhao Guo", 9 "Frank Guerin", 10 "Chenghua Lin" 11 ], 12 "year": 2024, 13 "venue": "arXiv.org", 14 "arxiv_id": "2402.00861", 15 "doi": "10.48550/arXiv.2402.00861" 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": true, 22 "justification": "All major abstract claims — compression correlates with training cutoff, Mistral/Llama-2 balance, domain-specific generalization differences, and context/tokenization impacts — are supported by Tables 3–6 and Figures 1–2.", 23 "source": "haiku" 24 }, 25 "causal_claims_justified": { 26 "applies": true, 27 "answer": false, 28 "justification": "Claims like 'further training on domain knowledge can lead to weaker generalization' (CodeLlama vs Llama-2) are based on observational model comparisons, not controlled experiments that isolate the effect of domain fine-tuning.", 29 "source": "haiku" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": false, 34 "justification": "The conclusion broadly claims the method 'avoids data contamination and the potential interference of different prompts' without acknowledging scope limitations: only base models, only open-source models, only cases where cutoff dates are known.", 35 "source": "haiku" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper briefly speculates that arXiv performance is maintained 'perhaps due to consistent writing styles' but does not systematically consider alternative explanations for any observed cross-model or cross-domain patterns.", 41 "source": "haiku" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper explicitly grounds compression rate as a proxy for 
generalization via Shannon information theory (Section 2.2) and validates the proxy empirically by comparing model rankings against HumanEval and MMLU in Table 4.", 47 "source": "haiku" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": false, 54 "justification": "There is no limitations section. Section 7 ('Impact') only states 'none which we feel must be specifically highlighted here,' and the conclusion discusses only future work.", 55 "source": "haiku" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": false, 60 "justification": "No threats-to-validity section exists. The only specific caveat is a passing note in Section 5.6 that tokenization analysis was conducted on English data only, which 'inherently favors English models.'", 61 "source": "haiku" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": false, 66 "justification": "The paper does not explicitly state what results do NOT show — e.g., applicability only to base (non-instruction-tuned) models, only open-source models with accessible token probabilities, or only when cutoff dates are known.", 67 "source": "haiku" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": false, 74 "justification": "No funding acknowledgment appears anywhere in the paper — neither in the text, footnotes, nor appendices.", 75 "source": "haiku" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "Author affiliations are disclosed in the header: University of Surrey (Li, Guerin), Harbin Engineering University (Guo), University of Manchester (Lin).", 81 "source": "haiku" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": false, 85 "answer": false, 86 "justification": "No funding is disclosed, so this criterion is not applicable.", 87 "source": "haiku" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": 
false, 92 "justification": "No competing interests or financial interests statement appears in the paper.", 93 "source": "haiku" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "Key terms are explicitly defined: 'generalization' (compression performance on post-cutoff data), 'robustness' (gap between training and testing period rates), and 'compression rate' (compressed size / raw size).", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "The contribution is clearly stated: a lossless data compression-based evaluation approach using temporal train/test splits to avoid contamination and prompt sensitivity, evaluated across 14 models and 6 data domains.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper engages substantively with prior work on benchmark contamination (Li et al. 2023c, Jacovi et al. 2023), the compression-generalization equivalence (Deletang et al. 
2023), and existing evaluation frameworks (MMLU, HumanEval).", 113 "source": "haiku" 114 } 115 } 116 }, 117 "type_checklist": { 118 "empirical": { 119 "artifacts": { 120 "code_released": { 121 "applies": true, 122 "answer": true, 123 "justification": "Source code is released at https://github.com/liyucheng09/llm-compressive, explicitly stated in the abstract.", 124 "source": "haiku" 125 }, 126 "data_released": { 127 "applies": true, 128 "answer": false, 129 "justification": "While GitHub is mentioned for 'data and code,' BBC news articles, images, and audio are collected under the ERA license which restricts redistribution beyond educational use; the full test corpus cannot be freely released.", 130 "source": "haiku" 131 }, 132 "environment_specified": { 133 "applies": true, 134 "answer": false, 135 "justification": "No requirements.txt, Dockerfile, or library dependency specifications are mentioned; only that 32-bit precision arithmetic coding was implemented.", 136 "source": "haiku" 137 }, 138 "reproduction_instructions": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper does not provide step-by-step reproduction instructions; it points to the GitHub repo but does not describe how to reproduce the full experimental pipeline in the paper itself.", 142 "source": "haiku" 143 } 144 }, 145 "statistical_methodology": { 146 "confidence_intervals_or_error_bars": { 147 "applies": true, 148 "answer": false, 149 "justification": "No confidence intervals or error bars are reported for any compression rate results in Tables 3–6 or the figures.", 150 "source": "haiku" 151 }, 152 "significance_tests": { 153 "applies": true, 154 "answer": false, 155 "justification": "No statistical significance tests are applied despite comparative claims (e.g., 'Mistral-7B achieves the most favorable balance among models under 7B').", 156 "source": "haiku" 157 }, 158 "effect_sizes_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Absolute 
compression rate differences are reported with direction arrows in Table 3 (e.g., LLaMA-65B worsens by 1.10pp on Wikipedia), providing interpretable effect sizes with baseline context.", 162 "source": "haiku" 163 }, 164 "sample_size_justified": { 165 "applies": true, 166 "answer": false, 167 "justification": "The selection of 500 Wikipedia articles, 1,270 news articles per month, 75 GitHub projects, etc. is stated but never justified with statistical rationale or power analysis.", 168 "source": "haiku" 169 }, 170 "variance_reported": { 171 "applies": true, 172 "answer": false, 173 "justification": "Only mean compression rates are reported across the corpus; no variance, standard deviation, or distributional spread across documents or repeated runs is provided.", 174 "source": "haiku" 175 } 176 }, 177 "evaluation_design": { 178 "baselines_included": { 179 "applies": true, 180 "answer": true, 181 "justification": "Traditional compression algorithms (Gzip, PNG, FLAC) are included as baselines in Table 3, and results are compared against HumanEval and MMLU in Table 4.", 182 "source": "haiku" 183 }, 184 "baselines_contemporary": { 185 "applies": true, 186 "answer": true, 187 "justification": "The LLM comparisons include contemporary 2023 models (Mistral, Llama-2, Yi, Qwen, Baichuan2, ChatGLM3); traditional compression baselines are appropriate references for compression-rate evaluation.", 188 "source": "haiku" 189 }, 190 "ablation_study": { 191 "applies": true, 192 "answer": true, 193 "justification": "Context size ablations (2K, 4K, 8K, 2K+SW) are reported in Table 5, and tokenization effects across vocabulary sizes are analyzed in Table 6.", 194 "source": "haiku" 195 }, 196 "multiple_metrics": { 197 "applies": true, 198 "answer": true, 199 "justification": "The paper uses compression rate as the primary metric, plus BPT (bits per token) and BPC (bits per character) for tokenization analysis in Table 6.", 200 "source": "haiku" 201 }, 202 "human_evaluation": { 203 
"applies": false, 204 "answer": false, 205 "justification": "Human evaluation is not relevant for this automated compression-based evaluation that measures raw token probability distributions.", 206 "source": "haiku" 207 }, 208 "held_out_test_set": { 209 "applies": true, 210 "answer": true, 211 "justification": "The 2023 data constitutes a temporally held-out test period explicitly separated from the 2017-2022 training period; this temporal split is the paper's central methodological contribution.", 212 "source": "haiku" 213 }, 214 "per_category_breakdown": { 215 "applies": true, 216 "answer": true, 217 "justification": "Results are broken down per domain (Wikipedia, BBC News, GitHub Code, arXiv, BBC Images, Audio-Mix) and per model in Table 3, with additional per-domain temporal visualizations in Appendix C.", 218 "source": "haiku" 219 }, 220 "failure_cases_discussed": { 221 "applies": true, 222 "answer": true, 223 "justification": "The paper explicitly discusses failure cases: all models fail on multi-modal byte streams (Section 5.4), and specific models (CodeLlama, InternLM) show steeper degradation on code data post-2023.", 224 "source": "haiku" 225 }, 226 "negative_results_reported": { 227 "applies": true, 228 "answer": true, 229 "justification": "Negative results are explicitly reported: all models fail to compress multi-modal data, larger static contexts do not exceed the sliding window approach, and CodeLlama's code specialization costs robustness.", 230 "source": "haiku" 231 } 232 }, 233 "setup_transparency": { 234 "model_versions_specified": { 235 "applies": true, 236 "answer": true, 237 "justification": "Table 2 specifies model names, parameter sizes, release dates, and training cutoff dates where available; for open-source base models with single public releases, the named versions identify the weights.", 238 "source": "haiku" 239 }, 240 "prompts_provided": { 241 "applies": false, 242 "answer": false, 243 "justification": "No prompts are used — the 
method directly measures token probability distributions on raw data for arithmetic coding, bypassing prompt design entirely.", 244 "source": "haiku" 245 }, 246 "hyperparameters_reported": { 247 "applies": true, 248 "answer": true, 249 "justification": "The default 2K context window is stated upfront, context size variations are reported in Table 5, and 32-bit precision for arithmetic coding is specified in Section 3.", 250 "source": "haiku" 251 }, 252 "scaffolding_described": { 253 "applies": false, 254 "answer": false, 255 "justification": "No agentic scaffolding is involved; models are evaluated directly via token probability distributions.", 256 "source": "haiku" 257 }, 258 "data_preprocessing_documented": { 259 "applies": true, 260 "answer": true, 261 "justification": "Section 4.1 documents preprocessing steps in detail: monthly Wikipedia snapshot monitoring, BBC image extraction (64×128 patches, grayscale), audio conversion (16kHz FLAC), GitHub code filtering (>50% changed), and arXiv LaTeX main-body extraction.", 262 "source": "haiku" 263 } 264 }, 265 "data_integrity": { 266 "raw_data_available": { 267 "applies": true, 268 "answer": false, 269 "justification": "BBC images and audio collected under ERA license (educational use only) cannot be freely redistributed, making the full raw test corpus unavailable for independent verification.", 270 "source": "haiku" 271 }, 272 "data_collection_described": { 273 "applies": true, 274 "answer": true, 275 "justification": "Section 4.1 describes data collection with specificity: 500 monitored Wikipedia articles, only front-page BBC articles, 75 popular GitHub projects with rich commit history, random arXiv papers with author/bibliography stripped.", 276 "source": "haiku" 277 }, 278 "recruitment_methods_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants; data is collected from online sources.", 282 "source": "haiku" 283 }, 284 "data_pipeline_documented": { 285 "applies": 
true, 286 "answer": true, 287 "justification": "The full pipeline is documented: collection → preprocessing → context-window chunking → per-chunk LLM probability estimation → arithmetic coding → compression rate calculation.", 288 "source": "haiku" 289 } 290 }, 291 "contamination": { 292 "training_cutoff_stated": { 293 "applies": true, 294 "answer": true, 295 "justification": "Table 2 lists training cutoff dates where documented (LLaMA ~2020, Llama-2 Sept 2022) and explicitly marks unknown cutoffs; the paper directly analyzes model behavior relative to these dates.", 296 "source": "haiku" 297 }, 298 "train_test_overlap_discussed": { 299 "applies": true, 300 "answer": true, 301 "justification": "Avoiding train-test overlap is the paper's central motivation; the cutoff-based temporal split is proposed precisely to eliminate overlap, and the compression divergence after cutoffs is presented as confirmation that existing benchmarks suffer from it.", 302 "source": "haiku" 303 }, 304 "benchmark_contamination_addressed": { 305 "applies": true, 306 "answer": true, 307 "justification": "The paper addresses benchmark contamination as its primary motivation with quantitative evidence (30-80% contamination in MMLU/SQuAD) and proposes post-cutoff evaluation as the solution.", 308 "source": "haiku" 309 } 310 }, 311 "human_studies": { 312 "pre_registered": { 313 "applies": false, 314 "answer": false, 315 "justification": "No human participants.", 316 "source": "haiku" 317 }, 318 "irb_or_ethics_approval": { 319 "applies": false, 320 "answer": false, 321 "justification": "No human participants.", 322 "source": "haiku" 323 }, 324 "demographics_reported": { 325 "applies": false, 326 "answer": false, 327 "justification": "No human participants.", 328 "source": "haiku" 329 }, 330 "inclusion_exclusion_criteria": { 331 "applies": false, 332 "answer": false, 333 "justification": "No human participants.", 334 "source": "haiku" 335 }, 336 "randomization_described": { 337 "applies": false, 
338 "answer": false, 339 "justification": "No human participants.", 340 "source": "haiku" 341 }, 342 "blinding_described": { 343 "applies": false, 344 "answer": false, 345 "justification": "No human participants.", 346 "source": "haiku" 347 }, 348 "attrition_reported": { 349 "applies": false, 350 "answer": false, 351 "justification": "No human participants.", 352 "source": "haiku" 353 } 354 }, 355 "cost_and_practicality": { 356 "inference_cost_reported": { 357 "applies": true, 358 "answer": true, 359 "justification": "Table 5 reports memory usage (MB) and wall-clock time (seconds) for compression across different context sizes for 5 models, providing practical cost comparisons.", 360 "source": "haiku" 361 }, 362 "compute_budget_stated": { 363 "applies": true, 364 "answer": false, 365 "justification": "No total computational budget (GPU type, GPU hours, or estimated cost) is stated for the full experiment set across 14 models and 6 datasets spanning 83 months.", 366 "source": "haiku" 367 } 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "Compression performance closely correlates with models' training data cutoff date, with clear performance divergence after the cutoff.", 374 "evidence": "Figure 1 shows LLaMA and Llama-2 track identically during their shared training period (2017-2020) and diverge sharply after LLaMA's 2020 cutoff on both Wikipedia and BBC News.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Models with similar in-distribution performance can demonstrate widely different generalization on post-cutoff unseen data.", 379 "evidence": "Table 3 and Figure 2(b) show models spread across the generalization-robustness space despite similar training-period compression rates; LLaMA-65B worsens 1.10pp while Mistral-7B worsens only 0.115pp on 2023 Wikipedia.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Models struggle to generalize on news and code data post-cutoff but maintain or improve on arXiv papers.", 384 "evidence": "Table 3 shows 
most models' arXiv compression rates decrease (improve) in 2023, while Wikipedia, news, and code rates increase (worsen); Section 5.4 attributes this to consistent academic writing styles.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "All tested LLMs fail to compress multi-modal data (images and audio), indicating limited byte-stream generalization.", 389 "evidence": "Table 3 shows all LLMs achieve compression rates of 146–212 on image/audio data, far worse than FLAC (76–95) and PNG (36–90), making LLMs worse than dedicated domain compressors.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "Context size with sliding window (2K+SW) consistently outperforms larger static contexts (4K, 8K) despite equivalent or lower memory.", 394 "evidence": "Table 5 shows 2K+SW achieves lower (better) compression rates than 4K or 8K static contexts across all 5 tested models on 2023 Wikipedia.", 395 "supported": "strong" 396 }, 397 { 398 "claim": "Compression-based evaluation correlates closely with established benchmarks HumanEval and MMLU.", 399 "evidence": "Table 4 shows near-identical model rankings on compression rate vs HumanEval (code domain) and MMLU (arXiv domain) for the 7 compared models, with Spearman rank correlation implied.", 400 "supported": "moderate" 401 }, 402 { 403 "claim": "Larger vocabulary tokenizers lead to higher bits-per-token, indicating greater difficulty in token-level prediction.", 404 "evidence": "Table 6 shows Qwen (152K vocab) achieves 2.75 BPT vs Llama-2 (32K vocab) at 2.31 BPT; however, the analysis is on English-only data which inherently disadvantages multilingual tokenizers.", 405 "supported": "moderate" 406 } 407 ], 408 "methodology_tags": [ 409 "benchmark-eval", 410 "observational" 411 ], 412 "key_findings": "The paper proposes lossless data compression as a contamination-resistant, prompt-free LLM evaluation metric, using temporal train/test splits to isolate post-cutoff generalization. 
Testing 14 open-source base LLMs across 6 data domains (2017-2023), the paper shows compression performance clearly degrades after training cutoffs while models with similar in-distribution performance diverge significantly in generalization — Mistral-7B achieves the best performance-robustness balance among 7B models. Domain specificity is pronounced: models fail on news and code but surprisingly maintain performance on arXiv papers, and all models completely fail on multi-modal byte streams. The compression metric correlates well with HumanEval and MMLU rankings, validating it as a viable contamination-resistant alternative to standard benchmarking.", 413 "red_flags": [ 414 { 415 "flag": "No statistical significance testing", 416 "detail": "All comparative claims (e.g., 'Mistral-7B achieves the most favorable balance') are made without significance tests; compression rate differences are not assessed against noise or variability across documents." 417 }, 418 { 419 "flag": "No confidence intervals or variance", 420 "detail": "Compression rates are reported as single point estimates with no variance or standard deviation across documents, making it impossible to assess whether observed differences are meaningful." 421 }, 422 { 423 "flag": "Unknown cutoff dates for most models", 424 "detail": "Table 2 shows that InternLM, CodeLlama, Baichuan2, Mistral, Qwen, ChatGLM3, and Yi all have undocumented cutoff dates; using 2023 as the test split assumes none were trained on 2023 data, an unverified assumption central to the method's validity." 425 }, 426 { 427 "flag": "BBC data licensing restricts reproducibility", 428 "detail": "BBC news articles, images, and audio are under ERA license (educational use only), meaning 3 of the 6 test datasets likely cannot be freely redistributed in full, limiting independent reproduction of results." 
429 }, 430 { 431 "flag": "No limitations section", 432 "detail": "The paper explicitly declines to discuss limitations or societal impact; key scope constraints (base models only, English-centric data, accessible token probability requirement, known cutoff dependency) are never systematically acknowledged." 433 } 434 ], 435 "cited_papers": [ 436 { 437 "title": "Language Modeling Is Compression", 438 "relevance": "Foundational theoretical grounding — establishes compression ability as equivalent to generalization ability via information theory, directly justifying the paper's core approach." 439 }, 440 { 441 "title": "An Open Source Data Contamination Report for Large Language Models", 442 "relevance": "Key motivation — demonstrates benchmark contamination can inflate accuracy by 7-14% on MMLU and C-Eval, establishing the problem the paper aims to solve." 443 }, 444 { 445 "title": "Quantifying Language Models' Sensitivity to Spurious Features in Prompt Design", 446 "relevance": "Motivation for prompt-free evaluation — shows models are highly sensitive to prompt formatting, justifying compression as an evaluation method that avoids prompt interference." 447 }, 448 { 449 "title": "Stop Uploading Test Data in Plain Text: Practical Strategies for Mitigating Data Contamination", 450 "relevance": "Related work on the contamination problem and mitigation strategies in LLM benchmark evaluation." 451 }, 452 { 453 "title": "Measuring Massive Multitask Language Understanding (MMLU)", 454 "relevance": "Used as a comparison benchmark to validate that compression rate correlates with established evaluation methods; 30-80% contamination in MMLU motivates the new approach." 455 }, 456 { 457 "title": "Evaluating Large Language Models Trained on Code (HumanEval)", 458 "relevance": "Used as a comparison benchmark for code evaluation; compression rate rankings correlate closely with HumanEval pass@1 rankings." 
459 }, 460 { 461 "title": "Data Contamination Through the Lens of Time", 462 "relevance": "Related work analyzing contamination as a temporal phenomenon — finds strong association between code problem presence on GitHub and model pass rates, directly paralleling this paper's approach." 463 }, 464 { 465 "title": "LatestEval: Addressing Data Contamination in Language Model Evaluation Through Dynamic and Time-Sensitive Test Construction", 466 "relevance": "Closely related concurrent work from the same first author on time-sensitive evaluation to avoid contamination." 467 } 468 ], 469 "engagement_factors": { 470 "practical_relevance": { 471 "score": 2, 472 "justification": "Offers a concrete, implementable alternative to standard benchmark evaluation that avoids contamination — ML practitioners building evaluations could directly adopt this approach using any open-source model." 473 }, 474 "surprise_contrarian": { 475 "score": 2, 476 "justification": "Using raw compression rate as the primary model evaluation metric is counterintuitive and challenges the dominant paradigm of task-based benchmark evaluation." 477 }, 478 "fear_safety": { 479 "score": 0, 480 "justification": "No AI safety or risk concerns raised; the paper focuses on evaluation methodology rather than model capabilities or harms." 481 }, 482 "drama_conflict": { 483 "score": 1, 484 "justification": "The paper criticizes existing benchmarks as contaminated and prompt-sensitive, but frames this as a fixable methodological problem rather than a controversy." 485 }, 486 "demo_ability": { 487 "score": 2, 488 "justification": "Code is released on GitHub and the method can be applied to any open-source LLM with accessible token probabilities; technically reproducible by the community." 489 }, 490 "brand_recognition": { 491 "score": 1, 492 "justification": "Authors are from University of Surrey, Harbin Engineering University, and University of Manchester — solid academic institutions but not famous AI labs like DeepMind, OpenAI, or Meta AI." 
493 } 494 }, 495 "hn_data": { 496 "threads": [ 497 { 498 "hn_id": "39257837", 499 "title": "Tiny Titans: Can Smaller LLMs Punch Above Their Weight?", 500 "points": 1, 501 "comments": 0, 502 "url": "https://news.ycombinator.com/item?id=39257837" 503 } 504 ], 505 "top_points": 1, 506 "total_points": 1, 507 "total_comments": 0 508 } 509 }