ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24016B)


      1 {
      2   "paper": {
      3     "title": "Towards Fundamental Language Models: Does Linguistic Competence Scale with Model Size?",
      4     "authors": ["Jaime Collado-Montañez", "L. Alfonso Ureña-López", "Arturo Montejo-Ráez"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2509.02225",
      8     "doi": "10.48550/arXiv.2509.02225"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Linguistic competence stabilizes at moderate model sizes (5-7B parameters) while internal factual knowledge continues to scale steeply with model size. Linear regression shows model size explains 81% of variance in internal factual knowledge but only 50% for linguistic competence, with a slope twice as steep for factual knowledge. Mann-Whitney U tests show no statistically significant improvement from medium to large models for any competence. Results support the proposed Fundamental Language Model paradigm of smaller linguistically competent models with external factual retrieval.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code repository or download link is provided anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All benchmarks used (WiC, BLiMP, RTE, MNLI, QQP, LAMBADA, BoolQ, COPA, MultiRC, ReCoRD, TriviaQA, TruthfulQA) are publicly available standard benchmarks, and evaluation was done through the publicly available LM Evaluation Harness framework."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper states 'Every experiment has been executed on two NVIDIA Ampere A100 GPUs' (Section 4) but provides no software versions, library versions, or environment specifications."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions, scripts, or step-by-step guide are provided."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables 1, 3-6 are reported as point estimates with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Mann-Whitney U tests are used for pairwise comparisons between size categories (Section 4.1.2, Table 2), with p-values reported at α=0.05."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports percentage performance differences (e.g., 'IFK shows a 39.50% performance difference between large and small models, while Linguistic Competence and EFK only shows 18.29% and 8.47% respectively'), R² values (0.81 vs 0.50), and regression slopes (0.059 vs 0.029) in Section 4.1."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for why 23 models were selected or whether this sample size is adequate for the statistical analyses performed."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Results appear to be single-run evaluations. No standard deviations, variance, or multiple-run results are reported."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Multiple model families across different sizes serve as comparisons (SmolLM2, Qwen2.5, Llama-3, OLMo-2, Falcon3, Gemma-2, Yi-1.5), though there is no 'baseline method' per se since this is a comparative scaling study."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Models evaluated include recent releases: OLMo-2-0325, Qwen2.5, Llama-3.2, Falcon3, Gemma-2 — all from 2024-2025."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "This is a comparative scaling study evaluating existing models, not proposing a system with components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple benchmarks are used within each competence category: 5 benchmarks for linguistic competence (WiC, BLiMP, RTE, MNLI, QQP), 5 for external factual knowledge, and 4 for internal factual knowledge."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant here; the paper measures model performance on established benchmarks with automated metrics."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Standard benchmark test sets are used via the LM Evaluation Harness framework, which uses the established test splits."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Detailed per-task breakdowns are provided in Tables 3-6 (linguistic subcompetences, semantic tasks, external FK tasks, internal FK tasks) in addition to aggregate scores."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No qualitative discussion of where or why specific models fail on specific benchmarks. Only aggregate scores are analyzed."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that medium-to-large improvements are not statistically significant (Table 2), and that external factual knowledge does not improve with scale beyond a threshold, which are effectively negative findings about scaling."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract's claim that 'internal factual knowledge grows significantly faster' is supported by the regression analysis (R²=0.81 vs 0.50, slope 0.059 vs 0.029), and its claim that 'linguistic competence and factual knowledge improve with scale' is supported by Table 1."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes causal-adjacent claims like 'model size is more closely tied to memorization than to core language ability' (abstract) and that model size 'drives' performance differences. However, model families differ in architecture and training data, so model size is confounded with these variables. The paper acknowledges this in limitations but the main text does not hedge the causal framing."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper tests only English benchmarks but proposes a general 'Fundamental Language Model' paradigm without bounding claims to English. The abstract says 'a path toward more efficient, interpretable, and sustainable NLP solutions' broadly. The limitations acknowledge this but the main claims are unbounded."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The limitations section discusses that architecture and training data decisions matter (not just size), that language-factual boundaries are hard to draw, and that results may not generalize across languages or tasks. Section 4 also notes architecture/training matter: 'depending more on architecture and training data decisions than on model size.'"
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly defines 'linguistic competence' using the CEFR framework (Section 1), maps it to specific sub-competences (lexical, grammatical, semantic), and maps each to specific benchmarks (Section 3.1). The proxy-outcome relationship is transparent — benchmark scores proxy for linguistic competence as defined by CEFR."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model names with sizes are given (e.g., 'SmolLM2-135M', 'Qwen2.5-0.5B', 'OLMo-2-0325-32B', 'Llama-3.2-1B'). These include version identifiers (e.g., OLMo-2-0325 vs OLMo-2-1124). However, some, such as 'gemma-2-2b', lack snapshot dates."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper states 'zero-shot setting' and uses the LM Evaluation Harness but does not provide the actual prompt templates used for each benchmark."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No hyperparameters (temperature, top-p, max tokens, decoding strategy) are reported for the model evaluations."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used; models are evaluated directly on benchmarks."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "No description of how benchmark data was preprocessed or how the LM Evaluation Harness formatted inputs. The paper references 'standard evaluation protocols' without specifying them."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "A dedicated 'Limitations' section is present after the conclusions, discussing English-only evaluation, architecture dependence, and linguistic-factual boundary challenges."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The limitations section raises specific threats: English-only benchmarks may not generalize, the linguistic-factual boundary is hard to draw ('understanding metaphors, cultural references, or domain-specific terminology often depends on both'), and findings may not extend to other architectures."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The limitations section explicitly states what was not tested: other languages, other architectures, real-world deployment, and hybrid approaches. It also acknowledges that phonological/orthoepic/orthographic competences are excluded (Section 1)."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw evaluation outputs, per-example predictions, or underlying data are made available."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3 describes in detail which benchmarks were selected for each competence and why, with citations for each benchmark. The evaluation framework (LM Evaluation Harness) is identified."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants; all data comes from standard public benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The pipeline from raw benchmark outputs to the aggregated scores in Tables 1, 3-6 is not documented. For example, how ReCoRD's F1 and exact match are averaged, or how TruthfulQA Generation's multiple metrics are combined, is mentioned briefly but the full pipeline is not made reproducible."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgments section lists multiple Spanish government grants (CONSENSO, MODERATES, SocialTOX), an FPI scholarship, and EU NextGenerationEU funding."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are affiliated with the Computer Science Department, University of Jaén, Spain. No evaluated product is their own."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funding comes from Spanish government and EU research grants, which have no financial stake in whether smaller models are linguistically competent."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the 23 models evaluated."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether any benchmark data (e.g., WiC, BLiMP, MNLI) appeared in the training data of the models."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Many benchmarks used (BLiMP 2020, MNLI 2018, WiC 2019, TriviaQA 2017) predate all models and are widely available online. No contamination analysis is performed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, latency, or time per evaluation is reported despite running 23 models across 14 benchmarks."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Only 'two NVIDIA Ampere A100 GPUs' is mentioned. No total GPU hours, wall-clock time, or compute budget is stated."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be single runs."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs per model-benchmark pair is never stated."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": false,
    305         "answer": false,
    306         "justification": "This is an evaluation study using pre-trained models with the LM Evaluation Harness defaults; no hyperparameter tuning is performed."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": false,
    310         "answer": false,
    311         "justification": "No configuration selection is performed; models are evaluated with default settings via the LM Evaluation Harness."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Nine Mann-Whitney U tests are performed (3 competences × 3 pairwise comparisons, Table 2) without any correction for multiple comparisons."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "The authors do not propose a system; they evaluate existing models on public benchmarks. Self-comparison bias does not apply."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Performance is plotted against model size (parameter count) but not against compute budget (FLOPs, GPU hours, training cost). Larger models require more compute at inference too, which is not discussed."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Section 3 provides a detailed mapping from the CEFR framework's linguistic competence definition to specific benchmarks, justifying why each benchmark measures the claimed construct (e.g., WiC for lexical competence, BLiMP for grammatical competence)."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved; models are evaluated directly on benchmarks."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage despite many benchmarks (BLiMP 2020, MNLI 2018, WiC 2019, TriviaQA 2017) predating all models by years."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether evaluation setups leak information through context formatting or prompting."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether benchmark examples may overlap with training data across model families."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Internal factual knowledge scales significantly faster with model size than linguistic competence.",
    365       "evidence": "Linear regression: R²=0.81 for IFK vs R²=0.50 for linguistic competence; slope 0.059 vs 0.029 (Section 4.1.1, Figure 2). IFK shows 39.50% performance gap between large/small models vs 18.29% for linguistic competence (Section 4.2).",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Linguistic competence stabilizes at moderate model sizes (5-7B parameters).",
    370       "evidence": "Mann-Whitney U tests show no significant improvement from medium to large models for linguistic competence (p=0.062, Table 2). Qwen2.5-3B scores 0.6909 vs OLMo-2-0325-32B at 0.7095 (Table 1).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "External factual knowledge does not continue to improve with model size after a certain threshold.",
    375       "evidence": "Gemma-2-9b (9.24B) achieves highest EFK score (0.7961), outperforming 32B models. Mann-Whitney U test medium vs large p=0.591 (Table 2).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The FLM paradigm — smaller linguistically competent models with external factual retrieval — is viable.",
    380       "evidence": "Supported indirectly by the scaling analysis showing linguistic competence saturation. No direct demonstration of an FLM system with external retrieval.",
    381       "supported": "weak"
    382     }
    383   ],
    384   "red_flags": [
    385     {
    386       "flag": "No contamination analysis",
    387       "detail": "All benchmarks predate the models by years (BLiMP 2020, MNLI 2018, WiC 2019, TriviaQA 2017). Models may have trained on benchmark data, which would inflate scores — particularly for linguistic benchmarks, undermining the core finding that linguistic competence saturates early."
    388     },
    389     {
    390       "flag": "Architecture/training confounded with size",
    391       "detail": "Different model families (Qwen, Llama, OLMo, etc.) have different architectures and training data. The paper attributes differences to model size but cannot separate size from architecture and training effects. For example, Qwen2.5-3B outperforms 6-10B models from other families on linguistic tasks."
    392     },
    393     {
    394       "flag": "No multiple comparison correction",
    395       "detail": "Nine statistical tests are performed without Bonferroni or other corrections. At α=0.05 with 9 tests, the chance of at least one false positive is ~37%."
    396     },
    397     {
    398       "flag": "Single-run results with no variance",
    399       "detail": "All results appear to be single evaluations with no variance or confidence intervals reported, making it impossible to assess result stability."
    400     },
    401     {
    402       "flag": "No hyperparameters reported",
    403       "detail": "Temperature, sampling strategy, and other inference parameters are not reported for any of the 23 models, despite these settings affecting benchmark performance."
    404     }
    405   ],
    406   "cited_papers": [
    407     {
    408       "title": "Scaling laws for neural language models",
    409       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    410       "year": 2020,
    411       "arxiv_id": "2001.08361",
    412       "relevance": "Foundational work on neural scaling laws that this paper builds upon to argue for linguistic competence saturation."
    413     },
    414     {
    415       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    416       "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"],
    417       "year": 2022,
    418       "relevance": "Benchmark used for internal factual knowledge evaluation; relevant to LLM evaluation methodology."
    419     },
    420     {
    421       "title": "Are emergent abilities of large language models a mirage?",
    422       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    423       "year": 2023,
    424       "relevance": "Challenges emergent abilities narrative; directly relevant to claims about scaling properties of LLM capabilities."
    425     },
    426     {
    427       "title": "Retrieval-augmented generation for knowledge-intensive nlp tasks",
    428       "authors": ["Patrick Lewis", "Ethan Perez"],
    429       "year": 2020,
    430       "relevance": "RAG foundational paper; the FLM paradigm proposes externalizing factual knowledge similarly to RAG."
    431     },
    432     {
    433       "title": "A framework for few-shot language model evaluation",
    434       "authors": ["Leo Gao", "Jonathan Tow"],
    435       "year": 2024,
    436       "relevance": "The LM Evaluation Harness used as the evaluation framework in this study."
    437     },
    438     {
    439       "title": "TinyStories: How small can language models be and still speak coherent english?",
    440       "authors": ["Ronen Eldan", "Yuanzhi Li"],
    441       "year": 2023,
    442       "arxiv_id": "2305.07759",
    443       "relevance": "Directly relevant to the FLM thesis — demonstrates small models can achieve linguistic coherence."
    444     },
    445     {
    446       "title": "A survey on evaluation of large language models",
    447       "authors": ["Yupeng Chang"],
    448       "year": 2024,
    449       "relevance": "Survey of LLM evaluation methods and benchmarks; relevant to evaluation methodology."
    450     },
    451     {
    452       "title": "ZebraLogic: On the scaling limits of LLMs for logical reasoning",
    453       "authors": ["Bill Yuchen Lin"],
    454       "year": 2025,
    455       "relevance": "Argues LLMs may have reached peak reasoning capabilities despite scale; supports the FLM thesis."
    456     }
    457   ]
    458 }

Impressum · Datenschutz