ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (26995B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evaluating Large Language Models for Generalization and Robustness via Data Compression",
      6     "authors": [
      7       "Yucheng Li",
      8       "Yunhao Guo",
      9       "Frank Guerin",
     10       "Chenghua Lin"
     11     ],
     12     "year": 2024,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2402.00861",
     15     "doi": "10.48550/arXiv.2402.00861"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All major abstract claims — compression correlates with training cutoff, Mistral/Llama-2 balance, domain-specific generalization differences, and context/tokenization impacts — are supported by Tables 3–6 and Figures 1–2.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "Claims like 'further training on domain knowledge can lead to weaker generalization' (CodeLlama vs Llama-2) are based on observational model comparisons, not controlled experiments that isolate the effect of domain fine-tuning.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The conclusion broadly claims the method 'avoids data contamination and the potential interference of different prompts' without acknowledging scope limitations: only base models, only open-source models, only cases where cutoff dates are known.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper briefly speculates that arXiv performance is maintained 'perhaps due to consistent writing styles' but does not systematically consider alternative explanations for any observed cross-model or cross-domain patterns.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly grounds compression rate as a proxy for generalization via Shannon information theory (Section 2.2) and validates the proxy empirically by comparing model rankings against HumanEval and MMLU in Table 4.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "There is no limitations section. Section 7 ('Impact') only states 'none which we feel must be specifically highlighted here,' and the conclusion discusses only future work.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No threats-to-validity section exists. The only specific caveat is a passing note in Section 5.6 that tokenization analysis was conducted on English data only, which 'inherently favors English models.'",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not explicitly state what results do NOT show — e.g., applicability only to base (non-instruction-tuned) models, only open-source models with accessible token probabilities, or only when cutoff dates are known.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding acknowledgment appears anywhere in the paper — not in the text, footnotes, or appendices.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are disclosed in the header: University of Surrey (Li, Guerin), Harbin Engineering University (Guo), University of Manchester (Lin).",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding is disclosed, so this criterion is not applicable.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement appears in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are explicitly defined: 'generalization' (compression performance on post-cutoff data), 'robustness' (gap between training and testing period rates), and 'compression rate' (compressed size / raw size).",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The contribution is clearly stated: a lossless data compression-based evaluation approach using temporal train/test splits to avoid contamination and prompt sensitivity, evaluated across 14 models and 6 data domains.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper engages substantively with prior work on benchmark contamination (Li et al. 2023c, Jacovi et al. 2023), the compression-generalization equivalence (Deletang et al. 2023), and existing evaluation frameworks (MMLU, HumanEval).",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Source code is released at https://github.com/liyucheng09/llm-compressive, explicitly stated in the abstract.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "While GitHub is mentioned for 'data and code,' BBC news articles, images, and audio are collected under the ERA license which restricts redistribution beyond educational use; the full test corpus cannot be freely released.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No requirements.txt, Dockerfile, or library dependency specifications are mentioned; only that 32-bit precision arithmetic coding was implemented.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "The paper does not provide step-by-step reproduction instructions; it points to the GitHub repo but does not describe how to reproduce the full experimental pipeline in the paper itself.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "No confidence intervals or error bars are reported for any compression rate results in Tables 3–6 or the figures.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are applied despite comparative claims (e.g., 'Mistral-7B achieves the most favorable balance among models under 7B').",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Absolute compression rate differences are reported with direction arrows in Table 3 (e.g., LLaMA-65B worsens by 1.10pp on Wikipedia), providing interpretable effect sizes with baseline context.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The selection of 500 Wikipedia articles, 1,270 news articles per month, 75 GitHub projects, etc. is stated but never justified with statistical rationale or power analysis.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Only mean compression rates are reported across the corpus; no variance, standard deviation, or distributional spread across documents or repeated runs is provided.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Traditional compression algorithms (Gzip, PNG, FLAC) are included as baselines in Table 3, and results are compared against HumanEval and MMLU in Table 4.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "The LLM comparisons include contemporary 2023 models (Mistral, Llama-2, Yi, Qwen, Baichuan2, ChatGLM3); traditional compression baselines are appropriate references for compression-rate evaluation.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Context size ablations (2K, 4K, 8K, 2K+SW) are reported in Table 5, and tokenization effects across vocabulary sizes are analyzed in Table 6.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "The paper uses compression rate as the primary metric, plus BPT (bits per token) and BPC (bits per character) for tokenization analysis in Table 6.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "Human evaluation is not relevant for this automated compression-based evaluation that measures raw token probability distributions.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "The 2023 data constitutes a temporally held-out test period explicitly separated from the 2017-2022 training period; this temporal split is the paper's central methodological contribution.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down per domain (Wikipedia, BBC News, GitHub Code, arXiv, BBC Images, Audio-Mix) and per model in Table 3, with additional per-domain temporal visualizations in Appendix C.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "The paper explicitly discusses failure cases: all models fail on multi-modal byte streams (Section 5.4), and specific models (CodeLlama, InternLM) show steeper degradation on code data post-2023.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Negative results are explicitly reported: all models fail to compress multi-modal data, larger static contexts do not exceed the sliding window approach, and CodeLlama's code specialization costs robustness.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Table 2 specifies model names, parameter sizes, release dates, and training cutoff dates where available; for open-source base models with single public releases, the named versions identify the weights.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": false,
    242           "answer": false,
    243           "justification": "No prompts are used — the method directly measures token probability distributions on raw data for arithmetic coding, bypassing prompt design entirely.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "The default 2K context window is stated upfront, context size variations are reported in Table 5, and 32-bit precision for arithmetic coding is specified in Section 3.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is involved; models are evaluated directly via token probability distributions.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 4.1 documents preprocessing steps in detail: monthly Wikipedia snapshot monitoring, BBC image extraction (64×128 patches, grayscale), audio conversion (16kHz FLAC), GitHub code filtering (>50% changed), and arXiv LaTeX main-body extraction.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "BBC images and audio collected under ERA license (educational use only) cannot be freely redistributed, making the full raw test corpus unavailable for independent verification.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 4.1 describes data collection with specificity: 500 monitored Wikipedia articles, only front-page BBC articles, 75 popular GitHub projects with rich commit history, random arXiv papers with author/bibliography stripped.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; data is collected from online sources.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The full pipeline is documented: collection → preprocessing → context-window chunking → per-chunk LLM probability estimation → arithmetic coding → compression rate calculation.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": true,
    295           "justification": "Table 2 lists training cutoff dates where documented (LLaMA ~2020, Llama-2 Sept 2022) and explicitly marks unknown cutoffs; the paper directly analyzes model behavior relative to these dates.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Avoiding train-test overlap is the paper's central motivation; the cutoff-based temporal split is proposed precisely to eliminate overlap, and the compression divergence after cutoffs is presented as confirmation that existing benchmarks suffer from it.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "The paper addresses benchmark contamination as its primary motivation with quantitative evidence (30-80% contamination in MMLU/SQuAD) and proposes post-cutoff evaluation as the solution.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": true,
    359           "justification": "Table 5 reports memory usage (MB) and wall-clock time (seconds) for compression across different context sizes for 5 models, providing practical cost comparisons.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "No total computational budget (GPU type, GPU hours, or estimated cost) is stated for the full experiment set across 14 models and 6 datasets spanning 83 months.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Compression performance closely correlates with models' training data cutoff date, with clear performance divergence after the cutoff.",
    374       "evidence": "Figure 1 shows LLaMA and Llama-2 track identically during their shared training period (2017-2020) and diverge sharply after LLaMA's 2020 cutoff on both Wikipedia and BBC News.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Models with similar in-distribution performance can demonstrate widely different generalization on post-cutoff unseen data.",
    379       "evidence": "Table 3 and Figure 2(b) show models spread across the generalization-robustness space despite similar training-period compression rates; LLaMA-65B worsens 1.10pp while Mistral-7B worsens only 0.115pp on 2023 Wikipedia.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Models struggle to generalize on news and code data post-cutoff but maintain or improve on arXiv papers.",
    384       "evidence": "Table 3 shows most models' arXiv compression rates decrease (improve) in 2023, while Wikipedia, news, and code rates increase (worsen); Section 5.4 attributes this to consistent academic writing styles.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "All tested LLMs fail to compress multi-modal data (images and audio), indicating limited byte-stream generalization.",
    389       "evidence": "Table 3 shows all LLMs achieve compression rates of 146–212 on image/audio data, far worse than FLAC (76–95) and PNG (36–90), making LLMs worse than dedicated domain compressors.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "A 2K context with a sliding window (2K+SW) consistently outperforms larger static contexts (4K, 8K) at equivalent or lower memory usage.",
    394       "evidence": "Table 5 shows 2K+SW achieves lower (better) compression rates than 4K or 8K static contexts across all 5 tested models on 2023 Wikipedia.",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Compression-based evaluation correlates closely with established benchmarks HumanEval and MMLU.",
    399       "evidence": "Table 4 shows near-identical model rankings on compression rate vs HumanEval (code domain) and MMLU (arXiv domain) for the 7 compared models, with Spearman rank correlation implied.",
    400       "supported": "moderate"
    401     },
    402     {
    403       "claim": "Larger vocabulary tokenizers lead to higher bits-per-token, indicating greater difficulty in token-level prediction.",
    404       "evidence": "Table 6 shows Qwen (152K vocab) achieves 2.75 BPT vs Llama-2 (32K vocab) at 2.31 BPT; however, the analysis is on English-only data which inherently disadvantages multilingual tokenizers.",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "benchmark-eval",
    410     "observational"
    411   ],
    412   "key_findings": "The paper proposes lossless data compression as a contamination-resistant, prompt-free LLM evaluation metric, using temporal train/test splits to isolate post-cutoff generalization. Testing 14 open-source base LLMs across 6 data domains (2017-2023), the paper shows compression performance clearly degrades after training cutoffs while models with similar in-distribution performance diverge significantly in generalization — Mistral-7B achieves the best performance-robustness balance among 7B models. Domain specificity is pronounced: models fail on news and code but surprisingly maintain performance on arXiv papers, and all models completely fail on multi-modal byte streams. The compression metric correlates well with HumanEval and MMLU rankings, validating it as a viable contamination-resistant alternative to standard benchmarking.",
    413   "red_flags": [
    414     {
    415       "flag": "No statistical significance testing",
    416       "detail": "All comparative claims (e.g., 'Mistral-7B achieves the most favorable balance') are made without significance tests; compression rate differences are not assessed against noise or variability across documents."
    417     },
    418     {
    419       "flag": "No confidence intervals or variance",
    420       "detail": "Compression rates are reported as single point estimates with no variance or standard deviation across documents, making it impossible to assess whether observed differences are meaningful."
    421     },
    422     {
    423       "flag": "Unknown cutoff dates for most models",
    424       "detail": "Table 2 shows that InternLM, CodeLlama, Baichuan2, Mistral, Qwen, ChatGLM3, and Yi all have undocumented cutoff dates; using 2023 as the test split assumes none were trained on 2023 data, an unverified assumption central to the method's validity."
    425     },
    426     {
    427       "flag": "BBC data licensing restricts reproducibility",
    428       "detail": "BBC news articles, images, and audio are under ERA license (educational use only), meaning 2 of the 6 test datasets likely cannot be freely redistributed in full, limiting independent reproduction of results."
    429     },
    430     {
    431       "flag": "No limitations section",
    432       "detail": "The paper explicitly declines to discuss limitations or societal impact; key scope constraints (base models only, English-centric data, accessible token probability requirement, known cutoff dependency) are never systematically acknowledged."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "Language Modeling Is Compression",
    438       "relevance": "Foundational theoretical grounding — establishes compression ability as equivalent to generalization ability via information theory, directly justifying the paper's core approach."
    439     },
    440     {
    441       "title": "An Open Source Data Contamination Report for Large Language Models",
    442       "relevance": "Key motivation — demonstrates benchmark contamination can inflate accuracy by 7-14% on MMLU and C-Eval, establishing the problem the paper aims to solve."
    443     },
    444     {
    445       "title": "Quantifying Language Models' Sensitivity to Spurious Features in Prompt Design",
    446       "relevance": "Motivation for prompt-free evaluation — shows models are highly sensitive to prompt formatting, justifying compression as an evaluation method that avoids prompt interference."
    447     },
    448     {
    449       "title": "Stop Uploading Test Data in Plain Text: Practical Strategies for Mitigating Data Contamination",
    450       "relevance": "Related work on the contamination problem and mitigation strategies in LLM benchmark evaluation."
    451     },
    452     {
    453       "title": "Measuring Massive Multitask Language Understanding (MMLU)",
    454       "relevance": "Used as a comparison benchmark to validate that compression rate correlates with established evaluation methods; 30-80% contamination in MMLU motivates the new approach."
    455     },
    456     {
    457       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    458       "relevance": "Used as a comparison benchmark for code evaluation; compression rate rankings correlate closely with HumanEval pass@1 rankings."
    459     },
    460     {
    461       "title": "Data Contamination Through the Lens of Time",
    462       "relevance": "Related work analyzing contamination as a temporal phenomenon — finds strong association between code problem presence on GitHub and model pass rates, directly paralleling this paper's approach."
    463     },
    464     {
    465       "title": "LatestEval: Addressing Data Contamination in Language Model Evaluation Through Dynamic and Time-Sensitive Test Construction",
    466       "relevance": "Closely related concurrent work from the same first author on time-sensitive evaluation to avoid contamination."
    467     }
    468   ],
    469   "engagement_factors": {
    470     "practical_relevance": {
    471       "score": 2,
    472       "justification": "Offers a concrete, implementable alternative to standard benchmark evaluation that avoids contamination — ML practitioners building evaluations could directly adopt this approach using any open-source model."
    473     },
    474     "surprise_contrarian": {
    475       "score": 2,
    476       "justification": "Using raw compression rate as the primary model evaluation metric is counterintuitive and challenges the dominant paradigm of task-based benchmark evaluation."
    477     },
    478     "fear_safety": {
    479       "score": 0,
    480       "justification": "No AI safety or risk concerns raised; the paper focuses on evaluation methodology rather than model capabilities or harms."
    481     },
    482     "drama_conflict": {
    483       "score": 1,
    484       "justification": "The paper criticizes existing benchmarks as contaminated and prompt-sensitive, but frames this as a fixable methodological problem rather than a controversy."
    485     },
    486     "demo_ability": {
    487       "score": 2,
    488       "justification": "Code is released on GitHub and the method can be applied to any open-source LLM with accessible token probabilities; technically reproducible by the community."
    489     },
    490     "brand_recognition": {
    491       "score": 1,
    492       "justification": "Authors are from University of Surrey and University of Manchester — solid academic institutions but not famous AI labs like DeepMind, OpenAI, or Meta AI."
    493     }
    494   },
    495   "hn_data": {
    496     "threads": [
    497       {
    498         "hn_id": "39257837",
    499         "title": "Tiny Titans: Can Smaller LLMs Punch Above Their Weight?",
    500         "points": 1,
    501         "comments": 0,
    502         "url": "https://news.ycombinator.com/item?id=39257837"
    503       }
    504     ],
    505     "top_points": 1,
    506     "total_points": 1,
    507     "total_comments": 0
    508   }
    509 }

Impressum · Datenschutz