scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32217B)
      1 {
      2   "paper": {
      3     "title": "Evaluating Large Language Models for Generalization and Robustness via Data Compression",
      4     "authors": ["Yucheng Li", "Yunhao Guo", "Frank Guerin", "Chenghua Lin"],
      5     "year": 2024,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2402.00861",
      8     "doi": "10.48550/arXiv.2402.00861"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Lossless data compression is proposed as a contamination-resistant evaluation method for LLMs, using temporal splits around model training cutoffs. Across 14 models and 6 data sources spanning 83 months, compression performance degrades noticeably after cutoff dates, with clear divergence patterns between LLaMA and Llama-2. Models struggle to generalize on news and code but maintain performance on arXiv papers, and all models fail to compress raw multimodal byte streams. A 2K context with sliding window consistently outperforms larger static contexts.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states in the abstract: 'Our data and code can be found at https://github.com/liyucheng09/llm-compressive.' A working GitHub URL is provided."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The same GitHub repository is stated to contain the data: 'Our data and code can be found at https://github.com/liyucheng09/llm-compressive.'"
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment setup section is provided in the paper. The paper does not specify library versions or dependencies."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repository is referenced but the paper itself does not include commands or a 'Reproducing Results' section."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Table 3 and all other results report only point estimates (e.g., '7.539' compression rate). No confidence intervals, error bars, or ± notation are present anywhere in the paper."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper makes comparative claims (e.g., 'Mistral-7B achieves the most favorable balance') based solely on comparing raw compression rate numbers without any statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports compression rate differences relative to the 2017-2022 baseline period throughout Table 3 (e.g., '↑.333' for LLaMA-7B on wikitext, meaning the 2023 rate is 0.333 higher than the 2017-2022 rate), and states 'LLaMA-65B's compression rate worsens 20% on 2023 Wikipedia data compared to 2017-2022.'"
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is provided for why 500 Wikipedia articles, 1270 BBC news articles, 395 GitHub files, etc. were chosen. The sample sizes are stated in Table 1 without rationale or power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "All results appear to be from single runs. No standard deviation, variance, or spread measures are reported across experimental runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Table 3 includes traditional compression baselines: Gzip, PNG, and FLAC. These provide a comparative perspective against the LLM-based compression approach."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The 14 LLMs tested are all from 2023 releases. Gzip, PNG, and FLAC are established standard compression algorithms appropriate for comparison. The models represent the state of open-source LLMs at the time."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 5.5 systematically varies context size (2K, 4K, 8K, 2K+SW) holding all else constant, measuring each component's contribution to compression performance (Table 5). Section 5.6 analyzes the tokenization component separately."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper uses compression rate as the primary metric (Table 3), plus bits-per-token (BPT) and bits-per-character (BPC) in Table 6. It also measures time and memory cost in Table 5, and compares to HumanEval pass@1 and MMLU accuracy in Table 4."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is irrelevant for measuring compression rates — the evaluation is entirely automated and deterministic given the model and data."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper explicitly splits data into training period (2017-2022) and testing period (2023) based on model cutoff dates. Section 5.2 states: 'we split the test data into the training period (2017-2022) and the testing period (2023).'"
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 3 provides per-source breakdowns across all 6 datasets (Wikitext, BBC News, Code, arXiv, Image, Audio) for every model. Additional per-model-size comparisons are also provided."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 5.4 discusses that 'large language models struggle with pure byte streams' on multimodal data, and that models face challenges with 2023 Wikipedia, news, and code. Section 5.2 notes CodeLlama's lower robustness despite better code performance."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Several negative findings are reported: all models fail on multimodal compression (§5.4), CodeLlama has weaker generalization than base Llama-2 despite better code performance (§5.2), larger vocabularies hurt token-level prediction (§5.6), and larger contexts do not outperform sliding window (§5.5)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about compression rate reducing after cutoff (Fig 1, §5.1), Mistral and Llama-2 showing good balance (Fig 2b, §5.2), models struggling on news and code (Table 3, §5.4), and context/tokenization impact (Tables 5-6, §5.5-5.6) are all supported by corresponding results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper states 'further training on domain knowledge can lead to better domain capability, it may result in weaker generalization compared to the base model' based on comparing CodeLlama vs Llama-2 (§5.2). This is a causal claim from an observational comparison — these models differ in many ways beyond code training, and no controlled experiment isolates this factor."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims to evaluate 'Large Language Models' broadly, but only open-source models up to 70B are tested. Key closed-source models (GPT-4, Claude, PaLM) are excluded. Findings like 'models struggle to generalize on news and code data' (abstract) may not hold for larger or closed-source models."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for its findings. For example, the post-cutoff performance decline could be driven by topic drift, vocabulary shift, or world-event novelty rather than purely data contamination effects. The arXiv stability finding is attributed to 'consistent writing styles' without considering alternatives."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly frames compression rate as a metric grounded in information theory (§2.2), discusses the theoretical equivalence between compression and prediction, and validates the proxy against established benchmarks (HumanEval, MMLU) in Table 4 and §5.3 to show correlation. The connection between compression and generalization is the paper's core contribution."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Table 2 specifies each model with name, release date, size, and context length (e.g., 'Llama-2, 2023-07, 7/13/70B, 4096'). For open-source models, the family name + parameter count identifies the exact checkpoint."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting. Models compute token-level likelihoods on raw data for compression — no prompts or instructions are sent to the models."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The paper reports context window size (2K default, with 4K, 8K, and 2K+SW variants in §5.5), sliding window step size (512 tokens), chunk size (equal to context window), and arithmetic coding precision (32-bit). For likelihood computation, temperature/top-p are not applicable."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The method directly computes token likelihoods using LLMs and applies arithmetic coding."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4.1 documents preprocessing for each source: Wikipedia articles monitored monthly, BBC articles from front page only, GitHub filtered to newly added or 50%+ changed files, arXiv with author info/bibliographies/appendices excluded, images converted to 64×128 grayscale patches, audio converted to 16kHz FLAC."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper has no dedicated limitations section. It goes from Results (§5) to Conclusion (§6) to Impact (§7), which only says 'There are many potential societal consequences of our work, none which we feel must be specifically highlighted here.'"
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed. There is no consideration of specific threats such as the unknown cutoff dates for most models, the limitation to English-only text evaluation, or the potential for non-temporal confounds."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what its results do not show. No discussion of excluded model types (closed-source), excluded languages (only English text analysis noted briefly in §5.6), or limits on the generalization of the proposed method."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The GitHub repository at https://github.com/liyucheng09/llm-compressive is stated to contain both data and code, allowing independent verification of the compression results."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.1 provides detailed data collection descriptions: 500 Wikipedia articles monitored monthly, 1270 BBC front-page articles per month, 75 GitHub projects with rich commit history, randomly collected arXiv papers across all disciplines, BBC news images, and BBC radio/podcast audio. Time span (Jan 2017 to Nov 2023), sizes, and selection criteria are specified."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants are involved. Data sources are public archives (Wikipedia, BBC, GitHub, arXiv) documented in §4.1."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: collect monthly data from each source → tokenize/convert to byte streams → segment into chunks of context size C → compute likelihood with language model (Eq. 3) → apply arithmetic coding (Algorithms 1-2) → measure compressed size / raw size. Each transformation step is described in §3."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source or acknowledgments section is present in the paper. No mention of grants, sponsors, or funding agencies."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: University of Surrey (UK), Harbin Engineering University (China), University of Manchester (UK). These are academic institutions with no direct financial interest in the evaluated models."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of any funding disclosure means the criterion is not satisfied."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interest statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Table 2 lists training cutoff dates where known: LLaMA ~2020 (estimated from CommonCrawl dump dates), Llama-2 September 2022. The paper acknowledges that 'many of these models have their technical reports released, limited details are shared regarding their pre-training data' (§4.2)."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "This is the paper's central methodology. Section 5.1 analyzes how model performance correlates with training data collection time, and §5.2 explicitly splits data into training period (2017-2022) and testing period (2023) to measure overlap effects."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Addressing benchmark contamination is the paper's primary motivation. Sections 1 and 2.1 extensively discuss contamination in existing benchmarks (citing 30-80% contamination rates in MMLU and SQuAD). The proposed temporal-split method is designed specifically to avoid contamination."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study. It evaluates LLM compression capabilities on automated datasets."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study uses publicly available text, code, images, and audio data."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Table 5 reports inference time in seconds (e.g., 118s for Mistral-7B at 2K context) and memory usage in megabytes (e.g., 15487MB) for each model across different context sizes."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget is stated. The paper does not mention what GPU hardware was used, total GPU hours, or overall compute cost for the experiments. Table 5 provides per-model relative costs but without hardware specification."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single deterministic runs (compression is deterministic given the model, but initial conditions or batching could vary)."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is not explicitly stated. Results are presented without indicating whether they come from single or multiple runs."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. The default 2K context size is used without justification, and the context size analysis in §5.5 explores a few configurations without reporting a search budget."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper defaults to 2K context for all main results without justifying this choice. The context size analysis (§5.5) shows 2K+SW is best, but main results (Table 3) use 2K without explanation."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors propose a new evaluation method and validate it by comparing to existing benchmarks (Table 4), but do not acknowledge potential bias in evaluating their own method or discuss limitations of their own approach vs alternatives."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Table 5 reports compression rate alongside computational cost (time and memory) for different context sizes, showing that the 2K+SW approach requires ~4x compute but outperforms larger static contexts. The time/memory/performance tradeoff is explicitly discussed in §5.5."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "The paper extensively discusses whether compression rate measures generalization ability (§2.2), grounding it in Shannon's information theory and citing Delétang et al. (2023) and Rae (2023). Section 5.3 validates the metric against HumanEval and MMLU, showing close rank correlation."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No agentic scaffolding is involved. The method directly uses models for likelihood computation without any scaffolding layer."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "Temporal leakage is the paper's central concern. The entire methodology splits data by model training cutoff dates, and §5.1 explicitly analyzes how performance correlates with whether data falls within or after the training period."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "Feature leakage is not explicitly discussed. While the compression evaluation setup (raw data without prompts) naturally avoids many forms of feature leakage, the paper does not explicitly address whether the evaluation setup could leak answer information."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The paper does not discuss whether its test data sources overlap structurally with training data. For example, the 500 monitored Wikipedia articles may have earlier versions in model training data, and the same GitHub repositories could appear in training corpora."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "The temporal split itself serves as a concrete leakage prevention method. Data is split by model training cutoff dates, with 2023 data used as the testing period since it postdates all models' training. This is a principled prevention strategy documented in §3 and §5.1."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Models' compression performance over time correlates closely with their training data collection cutoff, with clear divergences after the cutoff date.",
    365       "evidence": "Figure 1 shows LLaMA's compression rate worsening rapidly after its 2020 cutoff, while Llama-2 maintains performance until its September 2022 cutoff before also declining. Demonstrated on both wikitext and BBC news datasets (§5.1).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Models with similar performance on training-period data can demonstrate widely different generalization results on new, unseen data.",
    370       "evidence": "Table 3 shows models with comparable average rates diverge significantly on 2023 data. LLaMA-65B's compression rate worsens 20% on 2023 Wikipedia vs 2017-2022 (§5.2). Figure 2(b) visualizes the performance-robustness tradeoff space.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Models struggle to generalize on wikitext, news, and code data but generalize well on arXiv papers.",
    375       "evidence": "Table 3 shows all models have worse (higher) compression rates on 2023 wikitext, news, and code data (↑ arrows), but maintain or improve on arXiv (↓ arrows). The paper attributes this to 'consistent writing styles in academic papers' (§5.4).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "All models fail to compress multimodal data (images and audio).",
    380       "evidence": "Table 3 shows compression rates of 150-210% for images and audio, far worse than traditional compressors like PNG (64%) and FLAC (76%). The paper notes 'limited capabilities in dealing with raw byte streams' (§5.4).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "A 2K context with sliding window consistently outperforms larger static context sizes (4K, 8K).",
    385       "evidence": "Table 5 shows 2K+SW achieves the lowest compression rates across all tested models (e.g., Mistral-7B: 7.385% vs 7.462% for 8K), at the cost of approximately 4x compute (§5.5).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Compression performance correlates closely with established benchmark rankings (HumanEval, MMLU).",
    390       "evidence": "Table 4 shows model rankings on the paper's code compression metric closely match HumanEval rankings, and arXiv compression ranks match MMLU rankings (§5.3). However, this is based on rank correlation across only 7 models.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Models with larger vocabularies have more difficulty with token-level prediction despite better pre-compression.",
    395       "evidence": "Table 6 shows Qwen (152K vocab) has a higher (i.e., worse) BPT (2.75) than Llama-2 (32K vocab, BPT 2.31) while producing fewer total tokens. The BPC comparison is closer, suggesting the token-level difficulty partially offsets the pre-compression benefit (§5.6).",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No error bars or uncertainty quantification",
    402       "detail": "All results across Tables 3-6 and Figures 1-5 are single-point estimates with no confidence intervals, standard deviations, or significance tests. For claims comparing models (e.g., 'Mistral-7B achieves the most favorable balance'), the observed differences may not be statistically meaningful."
    403     },
    404     {
    405       "flag": "No limitations section",
    406       "detail": "The paper has no dedicated limitations or threats-to-validity section. Key limitations like unknown cutoff dates for most models, restriction to English text, exclusion of closed-source models, and potential Wikipedia version overlap with training data are not discussed."
    407     },
    408     {
    409       "flag": "Unknown training cutoffs for most models",
    410       "detail": "Table 2 shows cutoff dates for only LLaMA (~2020, estimated) and Llama-2 (September 2022). The remaining evaluated models have no known cutoff, yet 2023 is used as the universal test period. If some models' training data extends into 2023, the temporal split is compromised."
    411     },
    412     {
    413       "flag": "Missing hardware specification",
    414       "detail": "No GPU type or hardware specification is mentioned. Table 5 reports time and memory but without anchoring to specific hardware, the cost numbers are not reproducible or comparable."
    415     }
    416   ],
    417   "cited_papers": [
    418     {
    419       "title": "Language modeling is compression",
    420       "authors": ["G. Delétang", "A. Ruoss", "P.-A. Duquenne", "E. Catt", "T. Genewein", "C. Mattern", "J. Grau-Moya", "L. K. Wenliang", "M. Aitchison", "L. Orseau"],
    421       "year": 2023,
    422       "arxiv_id": "2309.10668",
    423       "relevance": "Establishes the theoretical equivalence between language modeling and compression, the foundation of this paper's evaluation method."
    424     },
    425     {
    426       "title": "An open source data contamination report for large language models",
    427       "authors": ["Y. Li", "F. Guerin", "C. Lin"],
    428       "year": 2023,
    429       "arxiv_id": "2310.17589",
    430       "relevance": "Shows benchmark contamination can inflate accuracy by 7-14% on MMLU and C-Eval, motivating the need for contamination-resistant evaluation."
    431     },
    432     {
    433       "title": "Data contamination through the lens of time",
    434       "authors": ["M. Roberts", "H. Thakur", "C. Herlihy", "C. White", "S. Dooley"],
    435       "year": 2023,
    436       "arxiv_id": "2310.10628",
    437       "relevance": "Reveals significant association between a code problem's presence on GitHub and a model's pass rate, directly relevant to temporal contamination analysis."
    438     },
    439     {
    440       "title": "Quantifying language models' sensitivity to spurious features in prompt design",
    441       "authors": ["M. Sclar", "Y. Choi", "Y. Tsvetkov", "A. Suhr"],
    442       "year": 2023,
    443       "arxiv_id": "2310.11324",
    444       "relevance": "Demonstrates LLMs' high sensitivity to prompt design, motivating prompt-free evaluation methods like compression-based approaches."
    445     },
    446     {
    447       "title": "Evaluating large language models trained on code",
    448       "authors": ["M. Chen", "J. Tworek", "H. Jun", "Q. Yuan"],
    449       "year": 2021,
    450       "arxiv_id": "2107.03374",
    451       "relevance": "Introduces HumanEval benchmark for code generation, used as a comparison point for validating the compression-based evaluation method."
    452     },
    453     {
    454       "title": "Measuring massive multitask language understanding",
    455       "authors": ["D. Hendrycks", "C. Burns", "S. Basart", "A. Zou", "M. Mazeika", "D. Song", "J. Steinhardt"],
    456       "year": 2020,
    457       "arxiv_id": "2009.03300",
    458       "relevance": "Introduces MMLU, a major LLM evaluation benchmark that is shown to have 30-80% contamination rates and is used for validation comparison."
    459     },
    460     {
    461       "title": "Language models are few-shot learners",
    462       "authors": ["T. Brown", "B. Mann", "N. Ryder", "M. Subbiah"],
    463       "year": 2020,
    464       "relevance": "GPT-3 paper reporting that Wikipedia-based benchmarks are almost entirely contained in training data, demonstrating the contamination problem at scale."
    465     },
    466     {
    467       "title": "Stop uploading test data in plain text: Practical strategies for mitigating data contamination by evaluation benchmarks",
    468       "authors": ["A. Jacovi", "A. Caciularu", "O. Goldman", "Y. Goldberg"],
    469       "year": 2023,
    470       "arxiv_id": "2305.10160",
    471       "relevance": "Proposes strategies to mitigate benchmark contamination, directly relevant to the survey's focus on evaluation methodology quality."
    472     },
    473     {
    474       "title": "LLMZip: Lossless text compression using large language models",
    475       "authors": ["C. S. K. Valmeekam", "K. Narayanan", "D. Kalathil", "J.-F. Chamberland", "S. Shakkottai"],
    476       "year": 2023,
    477       "arxiv_id": "2306.04050",
    478       "relevance": "Demonstrates that LLMs can achieve state-of-the-art lossless compression via arithmetic coding, directly relevant to compression-based evaluation."
    479     },
    480     {
    481       "title": "A survey on evaluation of large language models",
    482       "authors": ["Y. Chang", "X. Wang", "J. Wang", "Y. Wu", "L. Yang", "K. Zhu"],
    483       "year": 2023,
    484       "relevance": "Comprehensive survey on LLM evaluation methods, providing context for the challenges in current evaluation approaches that this paper addresses."
    485     },
    486     {
    487       "title": "NLP evaluation in trouble: On the need to measure LLM data contamination for each benchmark",
    488       "authors": ["O. Sainz", "J. A. Campos", "I. García-Ferrero", "J. Etxaniz", "O. L. de Lacalle", "E. Agirre"],
    489       "year": 2023,
    490       "arxiv_id": "2310.18018",
    491       "relevance": "Argues for per-benchmark contamination measurement, highlighting the contamination crisis in LLM evaluation that this paper aims to address."
    492     }
    493   ],
    494   "engagement_factors": {
    495     "practical_relevance": {
    496       "score": 1,
    497       "justification": "Researchers could adopt compression-based evaluation as a complement to benchmarks, but practitioners cannot directly apply this in product development."
    498     },
    499     "surprise_contrarian": {
    500       "score": 1,
    501       "justification": "The finding that performance clearly degrades after training cutoff confirms data contamination concerns rather than challenging conventional wisdom."
    502     },
    503     "fear_safety": {
    504       "score": 0,
    505       "justification": "No safety, security, or risk concerns are raised by this evaluation methodology paper."
    506     },
    507     "drama_conflict": {
    508       "score": 1,
    509       "justification": "Touches on the 'benchmarks are contaminated' narrative and shows some models generalize poorly, but stops short of making provocative claims about specific companies."
    510     },
    511     "demo_ability": {
    512       "score": 2,
    513       "justification": "Code and data are released on GitHub, allowing reproduction of compression evaluations on new models."
    514     },
    515     "brand_recognition": {
    516       "score": 1,
    517       "justification": "Evaluates well-known models (LLaMA, Mistral, CodeLlama) but authors are from universities rather than major AI labs."
    518     }
    519   }
    520 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs