ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24697B)


      1 {
      2   "paper": {
      3     "title": "Information Capacity: Evaluating the Efficiency of Large Language Models via Text Compression",
      4     "authors": ["Cheng Yuan", "Jiawei Shao", "Xuelong Li"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2511.08066",
      8     "doi": "10.48550/arXiv.2511.08066"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "The paper introduces 'information capacity,' a metric evaluating LLM efficiency via text compression relative to computational complexity. Evaluating 52 open-source models across 5 datasets, the authors find consistent information capacity within model series, strong linguistic biases across models, and identify tokenizer efficiency, pretraining data, and MoE architecture as three dominant factors. The metric enables single-reference performance prediction across model scales with errors under ±8%, outperforming power-law-based approaches.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "GitHub link provided: https://github.com/TeleAI-AI-Flow/InformationCapacity"
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "HuggingFace dataset link provided: https://huggingface.co/datasets/TeleAI-AI-Flow/InformationCapacity"
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements.txt, or dependency details are mentioned in the paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. A code repository is linked but no README or reproduction guide is described."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Results are reported as point estimates throughout. No confidence intervals or error bars are provided for information capacity measurements."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper makes comparative claims about model rankings and correlations but uses no statistical significance tests. Pearson correlation coefficients are reported without p-values."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Effect sizes are reported contextually throughout. For example, Table 3 shows IC gains (e.g., ↑0.0104), Table 6 shows IC reductions from temperature changes, and Figure 7 shows percentage estimation errors. Differences are presented with enough baseline context."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Dataset sizes are stated (e.g., 200,000 samples for mixed text) but no justification is given for why these sizes were chosen or whether they are sufficient for stable measurements."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Information capacity values appear to be single-run measurements."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares information capacity against the power law method from Kaplan et al. (2020) for performance prediction (Section 4.4, Figure 7)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Comparisons include the densing law (Xiao et al., 2025) discussed in Sections 5.2-5.3, and the power law (Kaplan et al., 2020). These represent the relevant prior work."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 4.3 presents ablation studies on post-training (4.3.1), test sample length (4.3.2), and softmax temperature (4.3.3), examining their impact on information capacity."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper uses information capacity as the primary metric but also reports NLL, FLOPs, text size per token, and correlates with benchmark scores (MMLU, LiveCodeBench, C-Eval)."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant for evaluating a compression-based efficiency metric for LLMs."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Five separate evaluation datasets are used (Mixed text, FinePDFs-en, Ch-FineWeb-Edu, FineWeb-Edu, NextCoder). These are publicly available datasets used for evaluation, not used for any training or tuning."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per dataset (Table 2), per model series, and across multiple dimensions (tokenizer efficiency in Figure 3, pretraining data in Table 3, MoE in Table 4, temperature in Table 6)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses where information capacity is limited: post-training degrades it (Section 4.3.1), linguistic biases cause poor performance on non-native-language corpora (Section 4.1), and the offset approach has limitations."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Negative results include: post-training degrades information capacity (Section 4.3.1), temperature changes from T=1 always reduce IC (Table 6), and that the formula without offset (Eq. 4) is inadequate (Figure 2)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about consistent IC within model series (Figure 1, Table 2), linguistic biases (Table 2 rankings), three major factors (Sections 4.2.1-4.2.3), and performance prediction (Section 4.4) are all supported by results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims are made about tokenizer efficiency, pretraining data, and MoE architecture affecting IC. The ablation studies (Section 4.3) provide controlled manipulations. The TinyLlama pretraining data experiment (Table 3) uses the same model with different data volumes."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper claims to provide 'a unified metric of LLM efficiency' but tests only on text compression across open-source models. The title and abstract frame results broadly as evaluating LLM 'efficiency' without bounding to the tested setting of text compression performance."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for the consistent information capacity within model series. The offset parameter b is introduced to force consistency but no theoretical justification is given for why it should be constant."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures text compression performance and frames it as 'efficiency' and 'intelligence.' While the correlation between compression and intelligence is cited, the gap between compression performance and actual model capability/efficiency is not substantively discussed."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model names with sizes are provided throughout (e.g., 'Qwen3-0.6B-Base', 'Llama-3.2-1B', 'TinyLlama-1.1B'). For the 52 models evaluated, size and variant are specified."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting. Models are used as probability estimators on input text for compression, not prompted for generation."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 3.2 specifies: temperature T=1, sequence length L=1024, logits promoted from bfloat16 to float32. The offset values for each dataset are given in Table 1."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. Models are evaluated as probability estimators."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3.2 describes preprocessing: text samples are truncated to L=1024 tokens, the first token is excluded, ineffective logits are truncated before softmax, and samples shorter than the threshold are filtered out. Table 1 provides dataset details."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. The Discussion section (Section 5) discusses some design choices but does not substantively address limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No specific threats to validity are discussed. The paper does not address concerns like the arbitrary offset parameter, the choice of sequence length, or the stability of rankings."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what IC does not measure. It acknowledges that post-trained models cannot be fairly evaluated (Section 4.3.1) and that base models of some series are unavailable, but does not bound the scope of the metric's applicability."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The evaluation datasets are publicly available (HuggingFace link provided, and references to FinePDFs-en, Ch-FineWeb-Edu, FineWeb-Edu, NextCoder). Code is also released."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3.2 describes the five datasets with sources, sample counts, minimum/average lengths (Table 1). The Mixed text corpus composition (books, webpages, code, papers) is described."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are standard public datasets/benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from text to IC is documented: tokenization → truncation to L=1024 → logit extraction → NLL computation → FLOPs calculation (Eq. 6-8) → IC computation (Eq. 5). Filtering criteria (minimum length) are stated in Table 1."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information or acknowledgments section is present."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: 'Institute of Artificial Intelligence (TeleAI), China Telecom.'"
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed. Authors are from China Telecom (TeleAI), which develops AI products. Whether this creates a conflict is unclear, but the lack of disclosure is a concern."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate models on capability benchmarks. It measures compression performance (NLL) on text corpora, which is not a benchmark of model knowledge that could be contaminated."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "As with training_cutoff_stated, the paper measures text compression, not benchmark task performance. Overlap with pretraining data would affect NLL but is a feature of the compression task, not a contamination issue in the traditional sense."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — the paper evaluates compression performance, not knowledge-based benchmarks."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "Despite FLOPs being central to the metric, the paper does not report the actual computational cost (GPU hours, wall-clock time) of running the evaluation across 52 models and 5 datasets."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total compute budget is stated for running the experiments. The paper evaluates 52 models on hundreds of thousands of samples but does not report the hardware or time required."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No seed sensitivity analysis. The NLL computation is deterministic given a model and text, so seed sensitivity is less critical, but this is not discussed."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No statement about number of runs. NLL computation is deterministic for a given input, so single runs may be appropriate, but this is not explicitly stated."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The offset parameter b is a key hyperparameter (Table 1 shows different values per dataset), but no search budget or selection methodology for b is described."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The offset values in Table 1 differ per dataset (-24, -27, -18.7, -27, -27) but the selection process is not justified. How these values were determined is not explained."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors propose information capacity as a new metric and evaluate it, but do not acknowledge the bias of evaluating their own metric against alternatives."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "The metric itself is about compute vs. performance. The evaluation method has negligible compute differences across models (forward pass only)."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Section 4.5 discusses the correlation between IC and benchmark scores (MMLU, LiveCodeBench, C-Eval) and acknowledges that the correlation depends on dataset-benchmark alignment. Section 5.1 discusses what IC captures."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. Models are used directly as probability estimators."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The evaluation datasets (FineWeb-Edu, etc.) may overlap with model pretraining data. The paper does not discuss whether models were trained on these evaluation corpora, which would inflate compression performance."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "Feature leakage is not a concern here — the task is straightforward NLL computation on text."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The evaluation datasets may overlap with pretraining corpora. For example, FineWeb-Edu is a well-known pretraining dataset that some evaluated models may have been trained on. This is not discussed."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are used to check whether evaluation data appeared in model pretraining corpora."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "A series of models with varying sizes exhibits consistent information capacity.",
    365       "evidence": "Figure 1 and Table 2 show IC values for model families (e.g., Qwen3, Llama-3) remaining roughly constant across sizes within each series.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Tokenizer efficiency is the dominant factor in information capacity, with Pearson correlation coefficients consistently exceeding 0.98.",
    370       "evidence": "Figure 3 shows linear correlations between text size per token and IC across 4 datasets, with r values from 0.981 to 0.998.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Information capacity enables accurate cross-scale NLL prediction from a single reference model with errors bounded within ±8%.",
    375       "evidence": "Figures 6 and 7 show prediction errors: Qwen3 within ±3% (Figure 6a), Qwen2.5 within ±7.73% (Figure 7a), compared to power law errors exceeding 25% (Figure 7b).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Post-training degrades information capacity.",
    380       "evidence": "Figure 4 and Section 4.3.1 show IC drops for all tested models after post-training (e.g., Qwen3-Base vs Qwen3, Llama-3 vs Llama-3-Instruct).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "MoE architecture enhances information capacity by reducing FLOPs while maintaining prediction accuracy.",
    385       "evidence": "Table 4 shows MoE models (Qwen1.5-MoE, Qwen2-MoE) achieving higher IC than dense variants with similar activated parameters.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Strong linguistic biases exist in mainstream LLMs, with rankings varying significantly across languages.",
    390       "evidence": "Table 2 shows Llama-3 ranks 11th on Chinese (Ch-FineWeb-Edu) but 3rd on English PDFs (FinePDFs-en). Gemma-3 similarly underperforms on Chinese.",
    391       "supported": "strong"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Arbitrary offset parameter",
    397       "detail": "The offset b in Eq. 5 is introduced to force consistency of IC across model sizes, with different values per dataset (Table 1), but no principled derivation or selection methodology is provided. This parameter could be tuned to make any metric appear consistent."
    398     },
    399     {
    400       "flag": "Potential pretraining data overlap with evaluation corpora",
    401       "detail": "Evaluation datasets like FineWeb-Edu are commonly used as pretraining data. Models trained on FineWeb-Edu would achieve lower NLL and higher IC on that dataset, confounding the metric. This is not discussed."
    402     },
    403     {
    404       "flag": "No limitations section",
    405       "detail": "The paper has no dedicated limitations section, and its Discussion (Section 5) does not substantively address limitations, which is unusual for a methods paper introducing a new metric."
    406     },
    407     {
    408       "flag": "Company evaluating metric that could benefit its products",
    409       "detail": "Authors are from China Telecom (TeleAI). The AI Flow framework referenced as a use case (Shao and Li, 2025) is co-authored by the same group. The metric's application to device-edge-cloud infrastructure aligns with their corporate interests."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Scaling laws for neural language models",
    415       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    416       "year": 2020,
    417       "arxiv_id": "2001.08361",
    418       "relevance": "Foundational scaling laws paper that information capacity aims to improve upon for performance prediction."
    419     },
    420     {
    421       "title": "Training compute-optimal large language models",
    422       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"],
    423       "year": 2022,
    424       "relevance": "Chinchilla scaling laws for compute-optimal training, a key baseline for model efficiency analysis."
    425     },
    426     {
    427       "title": "Densing law of LLMs",
    428       "authors": ["Chaojun Xiao", "Jie Cai"],
    429       "year": 2025,
    430       "relevance": "Direct competitor metric for LLM efficiency that normalizes inference complexity via equivalent parameter count."
    431     },
    432     {
    433       "title": "Language modeling is compression",
    434       "authors": ["Gregoire Deletang", "Anian Ruoss"],
    435       "year": 2024,
    436       "relevance": "Establishes the theoretical link between language modeling and compression that motivates information capacity."
    437     },
    438     {
    439       "title": "Compression represents intelligence linearly",
    440       "authors": ["Yuzhen Huang", "Jinghan Zhang"],
    441       "year": 2024,
    442       "relevance": "Empirically validates the correlation between compression and intelligence that information capacity builds upon."
    443     },
    444     {
    445       "title": "Scaling LLM test-time compute optimally can be more effective than scaling parameters for reasoning",
    446       "authors": ["Charlie Victor Snell", "Jaehoon Lee"],
    447       "year": 2025,
    448       "relevance": "Test-time scaling work that motivates the need for inference efficiency metrics."
    449     },
    450     {
    451       "title": "Beyond chinchilla-optimal: accounting for inference in language model scaling laws",
    452       "authors": ["Nikhil Sardana", "Jacob Portes"],
    453       "year": 2024,
    454       "relevance": "Extends scaling laws to account for inference costs, directly relevant to LLM efficiency evaluation."
    455     },
    456     {
    457       "title": "Attention is all you need",
    458       "authors": ["Ashish Vaswani", "Noam Shazeer"],
    459       "year": 2017,
    460       "relevance": "Foundational transformer architecture paper underlying all evaluated models."
    461     },
    462     {
    463       "title": "DeepSeek-V2: A strong, economical, and efficient mixture-of-experts language model",
    464       "authors": ["DeepSeek-AI"],
    465       "year": 2024,
    466       "arxiv_id": "2405.04434",
    467       "relevance": "Introduces MLA attention mechanism evaluated in the paper's FLOPs calculations."
    468     }
    469   ]
    470 }

Impressum · Datenschutz