scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (34010B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evaluating Large Language Models for Generalization and Robustness via Data Compression",
      6     "authors": [
      7       "Yucheng Li",
      8       "Yunhao Guo",
      9       "Frank Guerin",
     10       "Chenghua Lin"
     11     ],
     12     "year": 2024,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2402.00861",
     15     "doi": "10.48550/arXiv.2402.00861"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims about compression rate reducing after cutoff (Fig 1, §5.1), Mistral and Llama-2 showing good balance (Fig 2b, §5.2), models struggling on news and code (Table 3, §5.4), and context/tokenization impact (Tables 5-6, §5.5-5.6) are all supported by corresponding results.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper states 'further training on domain knowledge can lead to better domain capability, it may result in weaker generalization compared to the base model' based on comparing CodeLlama vs Llama-2 (§5.2). This is a causal claim from an observational comparison — these models differ in many ways beyond code training, and no controlled experiment isolates this factor.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The title claims to evaluate 'Large Language Models' broadly, but only open-source models up to 70B are tested. Key closed-source models (GPT-4, Claude, PaLM) are excluded. Findings like 'models struggle to generalize on news and code data' (abstract) may not hold for larger or closed-source models.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not discuss alternative explanations for its findings. For example, the post-cutoff performance decline could be driven by topic drift, vocabulary shift, or world-event novelty rather than purely data contamination effects. The arXiv stability finding is attributed to 'consistent writing styles' without considering alternatives.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly frames compression rate as a metric grounded in information theory (§2.2), discusses the theoretical equivalence between compression and prediction, and validates the proxy against established benchmarks (HumanEval, MMLU) in Table 4 and §5.3 to show correlation. The connection between compression and generalization is the paper's core contribution.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper has no dedicated limitations section. It goes from Results (§5) to Conclusion (§6) to Impact (§7), which only says 'There are many potential societal consequences of our work, none which we feel must be specifically highlighted here.'",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No threats to validity are discussed. There is no consideration of specific threats such as the unknown cutoff dates for most models, the limitation to English-only text evaluation, or the potential for non-temporal confounds.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not explicitly state what its results do not show. No discussion of excluded model types (closed-source), excluded languages (only English text analysis noted briefly in §5.6), or limits on the generalization of the proposed method.",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding source or acknowledgments section is present in the paper. No mention of grants, sponsors, or funding agencies.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are listed: University of Surrey (UK), Harbin Engineering University (China), University of Manchester (UK). These are academic institutions with no direct financial interest in the evaluated models.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of any funding disclosure means the criterion is not satisfied.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interest statement is present in the paper.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The paper explicitly defines 'generalization' (compression performance on post-cutoff testing period) and 'robustness' (gap between training and testing period compression rates) in the introduction and operationalizes them consistently throughout.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper clearly states it contributes a lossless compression-based evaluation framework that avoids data contamination and prompt sensitivity, with a specific temporal split methodology and 83-month dataset.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 provides substantive background engaging with prior benchmark evaluation approaches, compression-language model equivalence (Deletang et al.), and contamination literature, explaining how this work differs by using temporal splits.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "The paper states in the abstract: 'Our data and code can be found at https://github.com/liyucheng09/llm-compressive.' A working GitHub URL is provided.",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "The same GitHub repository is stated to contain the data: 'Our data and code can be found at https://github.com/liyucheng09/llm-compressive.'",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment setup section is provided in the paper. The paper does not specify library versions or dependencies.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repository is referenced but the paper itself does not include commands or a 'Reproducing Results' section.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Table 3 and all other results report only point estimates (e.g., '7.539' compression rate). No confidence intervals, error bars, or ± notation are present anywhere in the paper.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "The paper makes comparative claims (e.g., 'Mistral-7B achieves the most favorable balance') based solely on comparing raw compression rate numbers without any statistical significance tests.",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "The paper reports compression rate differences with baseline context throughout Table 3 (e.g., '↑.333' for LLaMA-7B on wikitext, meaning the 2023 rate is 0.333 higher than 2017-2022), and states 'LLaMA-65B's compression rate worsens 20% on 2023 Wikipedia data compared to 2017-2022.'",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No justification is provided for why 500 Wikipedia articles, 1270 BBC news articles, 395 GitHub files, etc. were chosen. The sample sizes are stated in Table 1 without rationale or power analysis.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "All results appear to be from single runs. No standard deviation, variance, or spread measures are reported across experimental runs.",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Table 3 includes traditional compression baselines: Gzip, PNG, and FLAC. These provide a comparative perspective against the LLM-based compression approach.",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "The 14 LLMs tested are all from 2023 releases. Gzip, PNG, and FLAC are established standard compression algorithms appropriate for comparison. The models represent the state of open-source LLMs at the time.",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Section 5.5 systematically varies context size (2K, 4K, 8K, 2K+SW) holding all else constant, measuring each component's contribution to compression performance (Table 5). Section 5.6 analyzes the tokenization component separately.",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "The paper uses compression rate as the primary metric (Table 3), plus bits-per-token (BPT) and bits-per-character (BPC) in Table 6. It also measures time and memory cost in Table 5, and compares to HumanEval pass@1 and MMLU accuracy in Table 4.",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "Human evaluation is irrelevant for measuring compression rates — the evaluation is entirely automated and deterministic given the model and data.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "The paper explicitly splits data into training period (2017-2022) and testing period (2023) based on model cutoff dates. Section 5.2 states: 'we split the test data into the training period (2017-2022) and the testing period (2023).'",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Table 3 provides per-source breakdowns across all 6 datasets (Wikitext, BBC News, Code, arXiv, Image, Audio) for every model. Additional per-model-size comparisons are also provided.",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Section 5.4 discusses that 'large language models struggle with pure byte streams' on multimodal data, and that models face challenges with 2023 Wikipedia, news, and code. Section 5.2 notes CodeLlama's lower robustness despite better code performance.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Several negative findings are reported: all models fail on multimodal compression (§5.4), CodeLlama has weaker generalization than base Llama-2 despite better code performance (§5.2), larger vocabularies hurt token-level prediction (§5.6), and larger contexts do not outperform sliding window (§5.5).",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Table 2 specifies each model with name, release date, size, and context length (e.g., 'Llama-2, 2023-07, 7/13/70B, 4096'). For open-source models, the family name + parameter count identifies the exact checkpoint.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": false,
    242           "answer": false,
    243           "justification": "The paper does not use prompting. Models compute token-level likelihoods on raw data for compression — no prompts or instructions are sent to the models.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "The paper reports context window size (2K default, with 4K, 8K, and 2K+SW variants in §5.5), sliding window step size (512 tokens), chunk size (equal to context window), and arithmetic coding precision (32-bit). For likelihood computation, temperature/top-p are not applicable.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is used. The method directly computes token likelihoods using LLMs and applies arithmetic coding.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 4.1 documents preprocessing for each source: Wikipedia articles monitored monthly, BBC articles from front page only, GitHub filtered to newly added or 50%+ changed files, arXiv with author info/bibliographies/appendices excluded, images converted to 64×128 grayscale patches, audio converted to 16kHz FLAC.",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "The GitHub repository at https://github.com/liyucheng09/llm-compressive is stated to contain both data and code, allowing independent verification of the compression results.",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 4.1 provides detailed data collection descriptions: 500 Wikipedia articles monitored monthly, 1270 BBC front-page articles per month, 75 GitHub projects with rich commit history, randomly collected arXiv papers across all disciplines, BBC news images, and BBC radio/podcast audio. Time span (Jan 2017 to Nov 2023), sizes, and selection criteria are specified.",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants are involved. Data sources are public archives (Wikipedia, BBC, GitHub, arXiv) documented in §4.1.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The pipeline is documented: collect monthly data from each source → tokenize/convert to byte streams → segment into chunks of context size C → compute likelihood with language model (Eq. 3) → apply arithmetic coding (Algorithms 1-2) → measure compressed size / raw size. Each transformation step is described in §3.",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": true,
    295           "justification": "Table 2 lists training cutoff dates where known: LLaMA ~2020 (estimated from CommonCrawl dump dates), Llama-2 September 2022. The paper acknowledges that 'many of these models have their technical reports released, limited details are shared regarding their pre-training data' (§4.2).",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "This is the paper's central methodology. Section 5.1 analyzes how model performance correlates with training data collection time, and §5.2 explicitly splits data into training period (2017-2022) and testing period (2023) to measure overlap effects.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "Addressing benchmark contamination is the paper's primary motivation. Sections 1 and 2.1 extensively discuss contamination in existing benchmarks (citing 30-80% contamination rates in MMLU and SQuAD). The proposed temporal-split method is designed specifically to avoid contamination.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants are involved in this study. It evaluates LLM compression capabilities on automated datasets.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants. The study uses publicly available text, code, images, and audio data.",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": true,
    359           "justification": "Table 5 reports inference time in seconds (e.g., 118s for Mistral-7B at 2K context) and memory usage in megabytes (e.g., 15487MB) for each model across different context sizes.",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "No total computational budget is stated. The paper does not mention what GPU hardware was used, total GPU hours, or overall compute cost for the experiments. Table 5 provides per-model relative costs but without hardware specification.",
    366           "source": "opus"
    367         }
    368       },
    369       "experimental_rigor": {
    370         "seed_sensitivity_reported": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single deterministic runs (compression is deterministic given the model, but initial conditions or batching could vary).",
    374           "source": "opus"
    375         },
    376         "number_of_runs_stated": {
    377           "applies": true,
    378           "answer": false,
    379           "justification": "The number of experimental runs is not explicitly stated. Results are presented without indicating whether they come from single or multiple runs.",
    380           "source": "opus"
    381         },
    382         "hyperparameter_search_budget": {
    383           "applies": true,
    384           "answer": false,
    385           "justification": "No hyperparameter search budget is reported. The default 2K context size is used without justification, and the context size analysis in §5.5 explores a few configurations without reporting a search budget.",
    386           "source": "opus"
    387         },
    388         "best_config_selection_justified": {
    389           "applies": true,
    390           "answer": false,
    391           "justification": "The paper defaults to 2K context for all main results without justifying this choice. The context size analysis (§5.5) shows 2K+SW is best, but main results (Table 3) use 2K without explanation.",
    392           "source": "opus"
    393         },
    394         "multiple_comparison_correction": {
    395           "applies": false,
    396           "answer": false,
    397           "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable.",
    398           "source": "opus"
    399         },
    400         "self_comparison_bias_addressed": {
    401           "applies": true,
    402           "answer": false,
    403           "justification": "The authors propose a new evaluation method and validate it by comparing to existing benchmarks (Table 4), but do not acknowledge potential bias in evaluating their own method or discuss limitations of their own approach vs alternatives.",
    404           "source": "opus"
    405         },
    406         "compute_budget_vs_performance": {
    407           "applies": true,
    408           "answer": true,
    409           "justification": "Table 5 reports compression rate alongside computational cost (time and memory) for different context sizes, showing that the 2K+SW approach requires ~4x compute but outperforms larger static contexts. The time/memory/performance tradeoff is explicitly discussed in §5.5.",
    410           "source": "opus"
    411         },
    412         "benchmark_construct_validity": {
    413           "applies": true,
    414           "answer": true,
    415           "justification": "The paper extensively discusses whether compression rate measures generalization ability (§2.2), grounding it in Shannon's information theory and citing Delétang et al. (2023) and Rae (2023). Section 5.3 validates the metric against HumanEval and MMLU, showing close rank correlation.",
    416           "source": "opus"
    417         },
    418         "scaffold_confound_addressed": {
    419           "applies": false,
    420           "answer": false,
    421           "justification": "No agentic scaffolding is involved. The method directly uses models for likelihood computation without any scaffolding layer.",
    422           "source": "opus"
    423         }
    424       },
    425       "data_leakage": {
    426         "temporal_leakage_addressed": {
    427           "applies": true,
    428           "answer": true,
    429           "justification": "Temporal leakage is the paper's central concern. The entire methodology splits data by model training cutoff dates, and §5.1 explicitly analyzes how performance correlates with whether data falls within or after the training period.",
    430           "source": "opus"
    431         },
    432         "feature_leakage_addressed": {
    433           "applies": true,
    434           "answer": false,
    435           "justification": "Feature leakage is not explicitly discussed. While the compression evaluation setup (raw data without prompts) naturally avoids many forms of feature leakage, the paper does not explicitly address whether the evaluation setup could leak answer information.",
    436           "source": "opus"
    437         },
    438         "non_independence_addressed": {
    439           "applies": true,
    440           "answer": false,
    441           "justification": "The paper does not discuss whether its test data sources overlap structurally with training data. For example, the 500 monitored Wikipedia articles may have earlier versions in model training data, and the same GitHub repositories could appear in training corpora.",
    442           "source": "opus"
    443         },
    444         "leakage_detection_method": {
    445           "applies": true,
    446           "answer": true,
    447           "justification": "The temporal split itself serves as a concrete leakage prevention method. Data is split by model training cutoff dates, with 2023 data used as the testing period since it postdates all models' training. This is a principled prevention strategy documented in §3 and §5.1.",
    448           "source": "opus"
    449         }
    450       }
    451     }
    452   },
    453   "claims": [
    454     {
    455       "claim": "Model compression performance correlates closely with training data cutoff dates, with clear divergence after the cutoff.",
    456       "evidence": "Figure 1 shows LLaMA and Llama-2 performing comparably before LLaMA's 2020 cutoff, then clearly diverging; Table 3 shows performance gap (Rate23 − Rate17-22) aligned with known cutoffs.",
    457       "supported": "strong"
    458     },
    459     {
    460       "claim": "Models with similar in-distribution performance can demonstrate widely different generalization on unseen post-cutoff data.",
    461       "evidence": "Table 3 and Figure 2(b) show, e.g., LLaMA-65B's compression rate worsening 20% on 2023 Wikipedia while Mistral-7B shows only 0.115 degradation despite comparable absolute performance.",
    462       "supported": "strong"
    463     },
    464     {
    465       "claim": "Compression rate rankings correlate with established benchmark scores (HumanEval, MMLU).",
    466       "evidence": "Table 4 shows ordinal rank agreement between compression rates on code/arXiv and HumanEval/MMLU scores for 7 models (Spearman-style match but no formal correlation computed).",
    467       "supported": "moderate"
    468     },
    469     {
    470       "claim": "Models generalize well on arXiv data but struggle on Wikipedia, news, and code.",
    471       "evidence": "Table 3 shows arXiv results with small or negative (improving) Rate23 − Rate17-22 gaps for most models, while news and code show consistent positive gaps.",
    472       "supported": "moderate"
    473     },
    474     {
    475       "claim": "Larger context sizes improve compression with diminishing returns; 2K+sliding window outperforms static larger contexts.",
    476       "evidence": "Table 5 shows compression rate improvements from 2K→4K→8K with diminishing gains, and 2K+SW consistently achieves the best rate at 4× the compute cost.",
    477       "supported": "strong"
    478     },
    479     {
    480       "claim": "All models fail to compress multimodal (image and audio) byte streams effectively.",
    481       "evidence": "Table 3 shows all LLMs with compression rates of 146–212% on images and audio, far worse than specialized codecs (PNG 64.9%, FLAC 76.3%).",
    482       "supported": "strong"
    483     },
    484     {
    485       "claim": "Mistral-7B achieves the best generalization-robustness balance among models in the 7B-parameter class.",
    486       "evidence": "Figure 2(b) places Mistral-7B in the top-right quadrant (strong generalization, high robustness); Table 3 shows lowest BBC news compression rate and small performance gap.",
    487       "supported": "moderate"
    488     }
    489   ],
    490   "methodology_tags": [
    491     "benchmark-eval",
    492     "observational"
    493   ],
    494   "key_findings": "The paper proposes lossless data compression as a contamination-free, prompt-insensitive LLM evaluation framework, testing 14 models on 6 data sources across 83 months (2017–2023). Compression performance reliably diverges after training cutoffs, empirically confirming that standard benchmarks may conflate memorization with generalization. Domain findings show models generalize well on arXiv (consistent academic style) but degrade on Wikipedia, news, and code (time-sensitive content); all models fail on raw multimodal byte streams. Mistral-7B and Llama-2-70B achieve the best generalization-robustness tradeoffs in their size classes, while sliding window contexts outperform static large contexts at the cost of ~4× compute.",
    495   "red_flags": [
    496     {
    497       "flag": "No statistical testing",
    498       "detail": "All model comparisons are made without significance tests or confidence intervals; conclusions about 'best' models or domain-specific findings are based on unverified point estimates."
    499     },
    500     {
    501       "flag": "Uncertain training cutoffs",
    502       "detail": "Most models (InternLM, CodeLlama, Baichuan2, Mistral, Qwen, ChatGLM3, Yi) do not report exact training cutoffs; using 2023 as the test period assumes these models haven't seen 2023 data, which is unverified."
    503     },
    504     {
    505       "flag": "Ordinal benchmark correlation only",
    506       "detail": "The correlation between compression rates and HumanEval/MMLU scores (Table 4) is presented only as rank-matching, without a formal correlation coefficient or other statistical measure, so the strength of the relationship may be overstated."
    507     },
    508     {
    509       "flag": "No variance across runs",
    510       "detail": "All results are single-pass point estimates; no repeated runs or variance measures are reported, making it impossible to assess result stability."
    511     },
    512     {
    513       "flag": "Alternative explanations not ruled out",
    514       "detail": "The performance gap after cutoff could reflect distribution shift in writing styles or topic drift rather than knowledge memorization; this confound is not discussed or controlled."
    515     }
    516   ],
    517   "cited_papers": [
    518     {
    519       "title": "Language Modeling is Compression",
    520       "relevance": "Core theoretical foundation establishing that LLM log-likelihood minimization is equivalent to lossless compression, directly enabling this paper's evaluation framework."
    521     },
    522     {
    523       "title": "An Open Source Data Contamination Report for Large Language Models",
    524       "relevance": "Prior work by same author group quantifying contamination rates in MMLU and C-Eval, motivating the need for contamination-free evaluation."
    525     },
    526     {
    527       "title": "Data Contamination Through the Lens of Time",
    528       "relevance": "Demonstrates association between GitHub code presence and LLM pass rates, directly motivating the temporal split approach."
    529     },
    530     {
    531       "title": "Quantifying Language Models' Sensitivity to Spurious Features in Prompt Design",
    532       "relevance": "Evidence that prompt sensitivity inflates or distorts benchmark results, one of three problems motivating this compression-based alternative."
    533     },
    534     {
    535       "title": "Stop Uploading Test Data in Plain Text: Practical Strategies for Mitigating Data Contamination by Evaluation Benchmarks",
    536       "relevance": "Documents the benchmark contamination problem this paper aims to solve."
    537     },
    538     {
    539       "title": "Measuring Massive Multitask Language Understanding (MMLU)",
    540       "relevance": "Key benchmark used for correlation validation and cited as an example of contamination-vulnerable evaluation."
    541     },
    542     {
    543       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    544       "relevance": "Second benchmark used for correlation validation; code evaluation benchmark whose rankings are compared against compression-based code evaluation."
    545     },
    546     {
    547       "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models",
    548       "relevance": "Primary model used to demonstrate training cutoff / generalization correlation; only model with publicly documented September 2022 cutoff date."
    549     },
    550     {
    551       "title": "A Survey on Evaluation of Large Language Models",
    552       "relevance": "Survey of existing evaluation methods that frames the landscape of benchmark-based evaluation this paper critiques."
    553     },
    554     {
    555       "title": "LatestEval: Addressing Data Contamination in Language Model Evaluation through Dynamic and Time-Sensitive Test Construction",
    556       "relevance": "Related prior work by same authors on contamination-aware evaluation, showing the lineage of this approach."
    557     }
    558   ],
    559   "engagement_factors": {
    560     "practical_relevance": {
    561       "score": 1,
    562       "justification": "Researchers could adopt compression-based evaluation as a complement to benchmarks, but practitioners cannot directly apply this in product development."
    563     },
    564     "surprise_contrarian": {
    565       "score": 1,
    566       "justification": "The finding that performance clearly degrades after training cutoff confirms data contamination concerns rather than challenging conventional wisdom."
    567     },
    568     "fear_safety": {
    569       "score": 0,
    570       "justification": "No safety, security, or risk concerns are raised by this evaluation methodology paper."
    571     },
    572     "drama_conflict": {
    573       "score": 1,
    574       "justification": "Touches on the 'benchmarks are contaminated' narrative and shows some models generalize poorly, but stops short of making provocative claims about specific companies."
    575     },
    576     "demo_ability": {
    577       "score": 2,
    578       "justification": "Code and data are released on GitHub, allowing reproduction of compression evaluations on new models."
    579     },
    580     "brand_recognition": {
    581       "score": 1,
    582       "justification": "Evaluates well-known models (LLaMA, Mistral, CodeLlama) but authors are from universities rather than major AI labs."
    583     }
    584   },
    585   "hn_data": {
    586     "threads": [
    587       {
    588         "hn_id": "39257837",
    589         "title": "Tiny Titans: Can Smaller LLMs Punch Above Their Weight?",
    590         "points": 1,
    591         "comments": 0,
    592         "url": "https://news.ycombinator.com/item?id=39257837"
    593       }
    594     ],
    595     "top_points": 1,
    596     "total_points": 1,
    597     "total_comments": 0
    598   }
    599 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs