scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (20820B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Information Capacity: Evaluating the Efficiency of Large Language Models via Text Compression",
      6     "authors": [
      7       "Cheng Yuan",
      8       "Jiawei Shao",
      9       "Chi Zhang",
     10       "Xuelong Li"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2511.08066",
     15     "doi": "10.48550/arXiv.2511.08066"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All five abstract claims (consistent IC within series, linguistic biases, three major factors, performance prediction accuracy, correlation with benchmarks) are backed by corresponding figures and tables in the paper.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper asserts causal language ('post-training impairs,' 'MoE architecture enhances') but compares pre-existing model variants rather than controlled interventions; confounders like training data differences are not ruled out.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The conclusion broadly claims IC is 'a valuable metric of model efficiency' without bounding to the scope tested (open-source base models only, closed-source models explicitly excluded, post-trained models noted as inaccurate).",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper finds tokenizer efficiency has r=0.98 correlation with IC but does not seriously engage with the alternative that IC is primarily measuring tokenizer quality rather than model intelligence, undermining the core framing.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper uses NLL/compression gain as 'model intelligence' without adequately distinguishing between compression ability and intelligence as constructs; the equivalence is asserted by citing prior work rather than argued in-paper.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "There is no dedicated limitations section; scattered constraints (closed-source models excluded, post-training degrades accuracy) appear inline but are not consolidated.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No threats-to-validity framing is present; the possibility that evaluation datasets overlap with model pretraining corpora (contamination) is never mentioned despite being a direct threat to the compression-based measurement.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not state explicit scope boundaries for where IC should not be applied; the metric is presented as broadly applicable to LLMs without qualification.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding disclosure is present anywhere in the paper; the authors' affiliation with China Telecom's TeleAI institute is stated but no funding source is explicitly acknowledged.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All authors are identified as affiliated with 'Institute of Artificial Intelligence (TeleAI), China Telecom,' clearly disclosed on the first page.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "The paper promotes the 'AI Flow framework' (Shao and Li, 2025; An et al., 2025) developed by the same institution, and rankings favor models from Chinese companies whose success may align with TeleAI's interests.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "There is no competing interests statement or financial disclosure of any kind in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Information capacity, model intelligence (NLL/compression gain), inference complexity (FLOPs), and tokenizer efficiency are all explicitly defined with formulas in Sections 2-3.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper clearly states it introduces 'information capacity' as a new metric and contributes evaluation of 52 models across 5 datasets plus a performance prediction method.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper actively situates IC against scaling laws (Kaplan et al., Hoffmann et al.), the densing law (Xiao et al., 2025), and the compression-intelligence correlation (Huang et al., 2024; Deletang et al., 2024), explaining how it differs from each.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "benchmark-creation": {
    119       "construct_design": {
    120         "construct_validity_argued": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "The paper argues in Sections 1-2 that text compression performance measures intelligence (via the training objective equivalence) and that dividing by log2(FLOPs) captures efficiency; the theoretical chain is laid out explicitly.",
    124           "source": "haiku"
    125         },
    126         "difficulty_distribution_characterized": {
    127           "applies": false,
    128           "answer": false,
    129           "justification": "Information capacity is a continuous scalar metric applied to text samples rather than a benchmark with discrete items; difficulty distribution is not a meaningful concept in this framework.",
    130           "source": "haiku"
    131         },
    132         "ceiling_floor_effects_checked": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "The paper does not check whether IC discriminates poorly at extremes; with r=0.98 correlation to tokenizer efficiency alone, models with similar tokenizers may cluster and lose discriminative power.",
    136           "source": "haiku"
    137         },
    138         "human_baseline_included": {
    139           "applies": false,
    140           "answer": false,
    141           "justification": "Human performance is not applicable to this metric, which measures LLM compression efficiency relative to FLOPs; no human participants are involved.",
    142           "source": "haiku"
    143         },
    144         "scoring_rubric_justified": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "The formula is mathematically derived but the critical offset parameter b is chosen empirically ('we find that a fixed offset is sufficient') with no principled justification for its value or selection method.",
    148           "source": "haiku"
    149         }
    150       },
    151       "robustness": {
    152         "contamination_resistance_designed": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "The evaluation datasets (FineWeb-Edu, FinePDFs-en, etc.) may have been in models' pretraining corpora; memorized text would compress better, artificially inflating IC — this risk is never acknowledged or addressed.",
    156           "source": "haiku"
    157         },
    158         "temporal_robustness_discussed": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "The paper does not discuss whether IC will remain discriminative as models evolve, nor whether future models could be specifically optimized to inflate IC without genuine efficiency gains.",
    162           "source": "haiku"
    163         },
    164         "failure_modes_discussed": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No systematic discussion of metric failure modes; the acknowledged limitation that post-trained models cannot be evaluated is noted in passing but not framed as a failure mode with implications.",
    168           "source": "haiku"
    169         },
    170         "baseline_implementations_provided": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Code is provided at https://github.com/TeleAI-AI-Flow/InformationCapacity and the evaluation dataset is on HuggingFace, enabling reproduction of reported numbers.",
    174           "source": "haiku"
    175         }
    176       },
    177       "documentation": {
    178         "dataset_documentation_complete": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "The proprietary 'Mixed text' dataset is described only as 'multilingual text corpus from diverse sources, including books, webpages, code, and published papers' with no data card, provenance, preprocessing steps, or deduplication methodology.",
    182           "source": "haiku"
    183         },
    184         "licensing_and_access_clear": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "The HuggingFace dataset and GitHub repository are linked but no licensing terms are specified in the paper for either the code or the datasets.",
    188           "source": "haiku"
    189         },
    190         "intended_use_specified": {
    191           "applies": true,
    192           "answer": false,
    193           "justification": "The paper explains what IC can be used for (compare model series, predict performance) but does not specify what should NOT be concluded from the metric results.",
    194           "source": "haiku"
    195         }
    196       }
    197     }
    198   },
    199   "claims": [
    200     {
    201       "claim": "Models within a series exhibit consistent information capacity regardless of model size.",
    202       "evidence": "Figure 1 shows IC values cluster per series across FLOPs range; Figure 6 shows NLL prediction errors within ±3% for Qwen3.",
    203       "supported": "moderate"
    204     },
    205     {
    206       "claim": "Tokenizer efficiency is the dominant factor in information capacity, with Pearson r > 0.98 across datasets.",
    207       "evidence": "Figure 3 shows near-perfect linear correlation between text size per token and IC across four datasets.",
    208       "supported": "strong"
    209     },
    210     {
    211       "claim": "Mainstream open-source LLMs exhibit strong linguistic biases, with rankings varying significantly across language-specific datasets.",
    212       "evidence": "Table 2 shows Llama-3 ranks 3rd on FineWeb-Edu but 11th on Ch-FineWeb-Edu; Gemma-3 ranks 7th on English but 9th on Chinese.",
    213       "supported": "strong"
    214     },
    215     {
    216       "claim": "Post-training impairs compression performance, with more advanced RL-based post-training causing more severe degradation.",
    217       "evidence": "Figure 4 shows instruct variants have lower IC than base models; Qwen3's multi-stage RL shows larger drop than SFT-only models.",
    218       "supported": "moderate"
    219     },
    220     {
    221       "claim": "MoE architecture achieves higher IC than dense models with equivalent activated parameters due to extended total parameters.",
    222       "evidence": "Table 4 shows Qwen1.5-MoE (2.7B active) achieves IC 0.2146 vs. dense Qwen1.5-4B at 0.2022.",
    223       "supported": "moderate"
    224     },
    225     {
    226       "claim": "IC-based performance prediction is significantly more accurate than the power law, with errors under 8% vs. over 25%.",
    227       "evidence": "Figure 7 directly compares IC method (-6% to +7.7%) vs power law (-20% to +26%) on Qwen2.5 series.",
    228       "supported": "strong"
    229     },
    230     {
    231       "claim": "Information capacity correlates with benchmark scores (MMLU r=0.715 when evaluated on domain-matched corpus).",
    232       "evidence": "Figure 8 shows correlation is highest when IC corpus matches benchmark domain; drops to r=0.456 for mismatched domains.",
    233       "supported": "weak"
    234     }
    235   ],
    236   "methodology_tags": [
    237     "benchmark-eval",
    238     "observational",
    239     "theoretical"
    240   ],
    241   "key_findings": "Information capacity, defined as compression gain divided by log2(FLOPs), is remarkably consistent within a model series, enabling single-reference performance prediction with <8% error — far outperforming the power law. Tokenizer efficiency is the dominant factor with r>0.98 correlation to IC across all tested datasets, raising questions about whether IC primarily measures tokenizer quality rather than model intelligence. Mainstream LLMs exhibit strong linguistic biases: Chinese-company models dominate Chinese corpus rankings while Meta and Google models perform poorly. Post-training consistently degrades compression performance, with more sophisticated RL methods causing larger drops.",
    242   "red_flags": [
    243     {
    244       "flag": "Tokenizer dominance confound",
    245       "detail": "With r=0.98 correlation between tokenizer efficiency and IC, the metric may be primarily measuring tokenizer byte-per-token ratios rather than model intelligence; the paper labels this a 'dominant factor' without acknowledging that it largely collapses the metric to a tokenizer ranking."
    246     },
    247     {
    248       "flag": "Ad hoc offset parameter",
    249       "detail": "The key offset b that makes IC constant within a model series is selected empirically ('we find that a fixed offset is sufficient') with no principled derivation, cross-validation, or sensitivity analysis — it is hand-tuned to produce the desired flat curves."
    250     },
    251     {
    252       "flag": "No contamination check",
    253       "detail": "Evaluation datasets (FineWeb-Edu, FinePDFs-en, etc.) may overlap with models' pretraining corpora; memorized text compresses trivially, and the paper never addresses this fundamental threat to measurement validity."
    254     },
    255     {
    256       "flag": "No limitations section",
    257       "detail": "The paper has no dedicated limitations or threats-to-validity section; constraints are scattered across ablation studies without systematic treatment."
    258     },
    259     {
    260       "flag": "Mixed text dataset undocumented",
    261       "detail": "The proprietary 'Mixed text' evaluation corpus — one of five key datasets — lacks provenance, preprocessing documentation, and data card, making results on it unreproducible."
    262     },
    263     {
    264       "flag": "Institutional self-promotion",
    265       "detail": "The paper repeatedly references the authors' own 'AI Flow framework' (3 self-citations) in motivation sections without this being central to the contribution, and no funding or competing interest is disclosed."
    266     }
    267   ],
    268   "cited_papers": [
    269     {
    270       "title": "Language Modeling is Compression",
    271       "relevance": "Core theoretical foundation: establishes that LLMs can function as lossless compressors via arithmetic coding, directly motivating the information capacity metric."
    272     },
    273     {
    274       "title": "Compression Represents Intelligence Linearly",
    275       "relevance": "Provides empirical validation of the compression-intelligence correlation that the paper builds upon as its central premise."
    276     },
    277     {
    278       "title": "Densing Law of LLMs",
    279       "relevance": "Primary competitor metric that information capacity is compared against; paper argues IC avoids the biases of the densing law's equivalent-parameter approach."
    280     },
    281     {
    282       "title": "Scaling Laws for Neural Language Models",
    283       "relevance": "Foundational scaling law used as baseline for performance prediction comparison; IC is shown to outperform the power law formulation."
    284     },
    285     {
    286       "title": "Training Compute-Optimal Large Language Models (Chinchilla)",
    287       "relevance": "Key prior work on scaling laws that information capacity is positioned to complement with an efficiency-focused metric."
    288     },
    289     {
    290       "title": "The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale",
    291       "relevance": "Source of two evaluation datasets (FineWeb-Edu) used for benchmark evaluation."
    292     },
    293     {
    294       "title": "Measuring Massive Multitask Language Understanding (MMLU)",
    295       "relevance": "Primary benchmark used to validate IC's correlation with downstream task performance in Section 4.5."
    296     },
    297     {
    298       "title": "Qwen2 Technical Report",
    299       "relevance": "One of the primary model families evaluated, used for MoE vs. dense comparisons and pretraining data ablations."
    300     }
    301   ],
    302   "engagement_factors": {
    303     "practical_relevance": {
    304       "score": 2,
    305       "justification": "Practitioners can directly apply IC to compare model efficiency tradeoffs before deployment, with code and dataset publicly available."
    306     },
    307     "surprise_contrarian": {
    308       "score": 2,
    309       "justification": "Finding that tokenizer efficiency (r=0.98) dominates IC more than model intelligence is surprising and challenges the framing of the metric itself."
    310     },
    311     "fear_safety": {
    312       "score": 0,
    313       "justification": "No AI risk or safety concerns raised; purely an efficiency measurement paper."
    314     },
    315     "drama_conflict": {
    316       "score": 0,
    317       "justification": "No controversy or conflicting claims with established work; paper extends rather than challenges consensus."
    318     },
    319     "demo_ability": {
    320       "score": 2,
    321       "justification": "Code on GitHub and dataset on HuggingFace allow immediate reproduction of the leaderboard for any open-source base model."
    322     },
    323     "brand_recognition": {
    324       "score": 1,
    325       "justification": "China Telecom / TeleAI is a large telecom company but not a recognized top-tier AI lab; paper evaluates well-known models (Llama, Qwen, DeepSeek) lending indirect recognition."
    326     }
    327   },
    328   "hn_data": {
    329     "threads": [
    330       {
    331         "hn_id": "42718166",
    332         "title": "Titans: Learning to Memorize at Test Time",
    333         "points": 161,
    334         "comments": 35,
    335         "url": "https://news.ycombinator.com/item?id=42718166",
    336         "created_at": "2025-01-15T22:39:49Z"
    337       },
    338       {
    339         "hn_id": "42028873",
    340         "title": "Spann: Highly-Efficient Billion-Scale Approximate Nearest Neighbor Search (2021)",
    341         "points": 124,
    342         "comments": 33,
    343         "url": "https://news.ycombinator.com/item?id=42028873",
    344         "created_at": "2024-11-02T20:02:23Z"
    345       },
    346       {
    347         "hn_id": "42688392",
    348         "title": "Titans: Learning to Memorize at Test Time",
    349         "points": 115,
    350         "comments": 15,
    351         "url": "https://news.ycombinator.com/item?id=42688392",
    352         "created_at": "2025-01-13T20:11:14Z"
    353       },
    354       {
    355         "hn_id": "42270468",
    356         "title": "Physics in Next-Token Prediction",
    357         "points": 28,
    358         "comments": 4,
    359         "url": "https://news.ycombinator.com/item?id=42270468",
    360         "created_at": "2024-11-29T03:10:50Z"
    361       },
    362       {
    363         "hn_id": "45952746",
    364         "title": "TabPFN-2.5: Advancing the State of the Art in Tabular Foundation Models",
    365         "points": 7,
    366         "comments": 0,
    367         "url": "https://news.ycombinator.com/item?id=45952746",
    368         "created_at": "2025-11-17T11:38:20Z"
    369       },
    370       {
    371         "hn_id": "42710451",
    372         "title": "Titans: Learning to Memorize at Test Time",
    373         "points": 5,
    374         "comments": 2,
    375         "url": "https://news.ycombinator.com/item?id=42710451",
    376         "created_at": "2025-01-15T13:16:03Z"
    377       },
    378       {
    379         "hn_id": "42274748",
    380         "title": "Physics-Informed Machine Learning: A Survey",
    381         "points": 3,
    382         "comments": 1,
    383         "url": "https://news.ycombinator.com/item?id=42274748",
    384         "created_at": "2024-11-29T16:04:38Z"
    385       },
    386       {
    387         "hn_id": "2893666",
    388         "title": "Every hour of TV watched after age 25 reduces life expectancy by 22 minutes",
    389         "points": 2,
    390         "comments": 1,
    391         "url": "https://news.ycombinator.com/item?id=2893666",
    392         "created_at": "2011-08-17T01:44:53Z"
    393       },
    394       {
    395         "hn_id": "2891660",
    396         "title": "Every hour of TV watching shortens life by 22 minutes",
    397         "points": 2,
    398         "comments": 1,
    399         "url": "https://news.ycombinator.com/item?id=2891660",
    400         "created_at": "2011-08-16T15:58:19Z"
    401       },
    402       {
    403         "hn_id": "33898229",
    404         "title": "Teaching Algorithmic Reasoning via In-Context Learning",
    405         "points": 2,
    406         "comments": 0,
    407         "url": "https://news.ycombinator.com/item?id=33898229",
    408         "created_at": "2022-12-07T18:36:38Z"
    409       }
    410     ],
    411     "top_points": 161,
    412     "total_points": 449,
    413     "total_comments": 92
    414   }
    415 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs