scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31737B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Towards Fundamental Language Models: Does Linguistic Competence Scale with Model Size?",
      6     "authors": [
      7       "Jaime Collado-Montañez",
      8       "L. Alfonso Ureña-López",
      9       "Arturo Montejo-Ráez"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2509.02225",
     14     "doi": "10.48550/arXiv.2509.02225"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract's claim that 'internal factual knowledge grows significantly faster' is supported by the regression analysis (R²=0.81 vs 0.50, slope 0.059 vs 0.029), and its claim that 'linguistic competence and factual knowledge improve with scale' is supported by Table 1.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper makes causal-adjacent claims like 'model size is more closely tied to memorization than to core language ability' (abstract) and that model size 'drives' performance differences. However, model families differ in architecture and training data, so model size is confounded with these variables. The paper acknowledges this in limitations but the main text does not hedge the causal framing.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper tests only English benchmarks but proposes a general 'Fundamental Language Model' paradigm without bounding claims to English. The abstract says 'a path toward more efficient, interpretable, and sustainable NLP solutions' broadly. The limitations acknowledge this but the main claims are unbounded.",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The limitations section discusses that architecture and training data decisions matter (not just size), that language-factual boundaries are hard to draw, and that results may not generalize across languages or tasks. Section 4 also notes architecture/training matter: 'depending more on architecture and training data decisions than on model size.'",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper explicitly defines 'linguistic competence' using the CEFR framework (Section 1), maps it to specific sub-competences (lexical, grammatical, semantic), and maps each to specific benchmarks (Section 3.1). The proxy-outcome relationship is transparent — benchmark scores proxy for linguistic competence as defined by CEFR.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "A dedicated 'Limitations' section is present after the conclusions, discussing English-only evaluation, architecture dependence, and linguistic-factual boundary challenges.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The limitations section raises specific threats: English-only benchmarks may not generalize, the linguistic-factual boundary is hard to draw ('understanding metaphors, cultural references, or domain-specific terminology often depends on both'), and findings may not extend to other architectures.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The limitations section explicitly states what was not tested: other languages, other architectures, real-world deployment, and hybrid approaches. It also acknowledges that phonological/orthoepic/orthographic competences are excluded (Section 1).",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Acknowledgments section lists multiple Spanish government grants (CONSENSO, MODERATES, SocialTOX), an FPI scholarship, and EU NextGenerationEU funding.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All authors are affiliated with the Computer Science Department, University of Jaén, Spain. No evaluated product is their own.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Funding comes from Spanish government and EU research grants, which have no financial stake in whether smaller models are linguistically competent.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is present in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Linguistic competence defined as 'model's capacity to generate and comprehend language with proficiency in linguistic structures'; further defined via CEFR framework (lexical, grammatical, semantic). FLMs, external/internal factual knowledge clearly distinguished.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Contribution explicitly framed: empirical evaluation of FLM paradigm viability; testing whether 'linguistic competence remains robust in smaller models' using benchmark comparisons across model families.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Related work section (Section 2) positions work relative to transformer linguistic analysis, BabyLM challenge findings, RAG systems, and agentic AI paradigm. Shows how FLMs differ from full-scale RAG.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "No code repository or download link is provided anywhere in the paper.",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "All benchmarks used (WiC, BLiMP, RTE, MNLI, QQP, LAMBADA, BoolQ, COPA, MultiRC, ReCoRD, TriviaQA, TruthfulQA) are publicly available standard benchmarks, and evaluation was done through the publicly available LM Evaluation Harness framework.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper states 'Every experiment has been executed on two NVIDIA Ampere A100 GPUs' (Section 4) but provides no software versions, library versions, or environment specifications.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No reproduction instructions, scripts, or step-by-step guide are provided.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "All results in Tables 1, 3-6 are reported as point estimates with no confidence intervals or error bars.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Mann-Whitney U tests are used for pairwise comparisons between size categories (Section 4.1.2, Table 2), with p-values reported at α=0.05.",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "The paper reports percentage performance differences (e.g., 'IFK shows a 39.50% performance difference between large and small models, while Linguistic Competence and EFK only shows 18.29% and 8.47% respectively') and R² values (0.81 vs 0.50) and regression slopes (0.059 vs 0.029) in Section 4.1.",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "No justification for why 23 models were selected or whether this sample size is adequate for the statistical analyses performed.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Results appear to be single-run evaluations. No standard deviations, variance, or multiple-run results are reported.",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Multiple model families across different sizes serve as comparisons (SmolLM2, Qwen2.5, Llama-3, OLMo-2, Falcon3, Gemma-2, Yi-1.5), though there is no 'baseline method' per se since this is a comparative scaling study.",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Models evaluated include recent releases: OLMo-2-0325, Qwen2.5, Llama-3.2, Falcon3, Gemma-2 — all from 2024-2025.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": false,
    191           "answer": false,
    192           "justification": "This is a comparative scaling study evaluating existing models, not proposing a system with components to ablate.",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Multiple benchmarks are used within each competence category: 5 benchmarks for linguistic competence (WiC, BLiMP, RTE, MNLI, QQP), 5 for external factual knowledge, and 4 for internal factual knowledge.",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "Human evaluation is not relevant here; the paper measures model performance on established benchmarks with automated metrics.",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Standard benchmark test sets are used via the LM Evaluation Harness framework, which uses the established test splits.",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Detailed per-task breakdowns are provided in Tables 3-6 (linguistic subcompetences, semantic tasks, external FK tasks, internal FK tasks) in addition to aggregate scores.",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "No qualitative discussion of where or why specific models fail on specific benchmarks. Only aggregate scores are analyzed.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The paper reports that medium-to-large improvements are not statistically significant (Table 2), and that external factual knowledge does not improve with scale beyond a threshold, which are effectively negative findings about scaling.",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Specific model names with sizes are given (e.g., 'SmolLM2-135M', 'Qwen2.5-0.5B', 'OLMo-2-0325-32B', 'Llama-3.2-1B'). These include version identifiers (e.g., OLMo-2-0325 vs OLMo-2-1124). However some like 'gemma-2-2b' lack snapshot dates.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "The paper states 'zero-shot setting' and uses the LM Evaluation Harness but does not provide the actual prompt templates used for each benchmark.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "No hyperparameters (temperature, top-p, max tokens, decoding strategy) are reported for the model evaluations.",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding is used; models are evaluated directly on benchmarks.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": false,
    260           "justification": "No description of how benchmark data was preprocessed or how the LM Evaluation Harness formatted inputs. The paper references 'standard evaluation protocols' without specifying them.",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "No raw evaluation outputs, per-example predictions, or underlying data are made available.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 3 describes in detail which benchmarks were selected for each competence and why, with citations for each benchmark. The evaluation framework (LM Evaluation Harness) is identified.",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants; all data comes from standard public benchmarks.",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "The pipeline from raw benchmark outputs to the aggregated scores in Tables 1, 3-6 is not documented. For example, how ReCoRD's F1 and exact match are averaged, or how TruthfulQA Generation's multiple metrics are combined, is mentioned briefly but the full pipeline is not made reproducible.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "No training data cutoff dates are stated for any of the 23 models evaluated.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of whether any benchmark data (e.g., WiC, BLiMP, MNLI) appeared in the training data of the models.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "Many benchmarks used (BLiMP 2020, MNLI 2018, WiC 2019, TriviaQA 2017) predate all models and are widely available online. No contamination analysis is performed.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in this study.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in this study.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants in this study.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in this study.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in this study.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in this study.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in this study.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No inference cost, latency, or time per evaluation is reported despite running 23 models across 14 benchmarks.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Only 'two NVIDIA Ampere A100 GPUs' is mentioned. No total GPU hours, wall-clock time, or compute budget is stated.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be single runs.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": false,
    378           "justification": "The number of experimental runs per model-benchmark pair is never stated.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": false,
    383           "answer": false,
    384           "justification": "This is an evaluation study using pre-trained models with the LM Evaluation Harness defaults; no hyperparameter tuning is performed.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": false,
    389           "answer": false,
    390           "justification": "No configuration selection is performed; models are evaluated with default settings via the LM Evaluation Harness.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": true,
    395           "answer": false,
    396           "justification": "Nine Mann-Whitney U tests are performed (3 competences × 3 pairwise comparisons, Table 2) without any correction for multiple comparisons.",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": false,
    401           "answer": false,
    402           "justification": "The authors do not propose a system; they evaluate existing models on public benchmarks. Self-comparison bias does not apply.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": false,
    408           "justification": "Performance is plotted against model size (parameter count) but not against compute budget (FLOPs, GPU hours, training cost). Larger models require more compute at inference too, which is not discussed.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": true,
    414           "justification": "Section 3 provides a detailed mapping from the CEFR framework's linguistic competence definition to specific benchmarks, justifying why each benchmark measures the claimed construct (e.g., WiC for lexical competence, BLiMP for grammatical competence).",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": false,
    419           "answer": false,
    420           "justification": "No scaffolding is involved; models are evaluated directly on benchmarks.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "No discussion of temporal leakage despite many benchmarks (BLiMP 2020, MNLI 2018, WiC 2019, TriviaQA 2017) predating all models by years.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "No discussion of whether evaluation setups leak information through context formatting or prompting.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "No discussion of whether benchmark examples may overlap with training data across model families.",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No leakage detection or prevention method is applied.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "Internal factual knowledge scales with model size (R²=0.81), while linguistic competence does not (R²=0.50)",
    455       "evidence": "Section 4.1.1, Figure 2, Table 1: regression analysis showing slope 0.059 for IFK vs 0.029 for linguistic competence; R² values explicitly reported.",
    456       "supported": "strong"
    457     },
    458     {
    459       "claim": "Linguistic competence stabilizes at 5-7B parameters with diminishing returns beyond that size",
    460       "evidence": "Section 4.2 and Table 1: Qwen2.5-3B (0.6909) close to Qwen2.5-32B (0.7688); Qwen2.5-7B (0.7239) outperforms larger models like Falcon3-10B (0.7167).",
    461       "supported": "moderate"
    462     },
    463     {
    464       "claim": "External factual knowledge does not improve with model size after a certain threshold",
    465       "evidence": "Section 4.2 and Table 5: Gemma-2-9b (0.7961) and Falcon3-10B (0.7746) outperform Qwen2.5-32B (0.7007); Mann-Whitney test shows no significant difference medium vs large (p=0.591).",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "Large models roughly double small models' internal factual knowledge scores (39.5% difference)",
    470       "evidence": "Section 4.2, Table 6: OLMo-2-0325-32B (0.5784) is ~2.4x small models; 39.5% performance gap calculated between large/small groups.",
    471       "supported": "strong"
    472     },
    473     {
    474       "claim": "The FLM paradigm is viable because linguistic competence doesn't require very large models",
    475       "evidence": "Abstract, conclusions, Tables 1-3: multiple modest-sized models (3-7B) achieve strong linguistic competence comparable to 32B models.",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "Mann-Whitney tests show no significant linguistic competence improvement from medium to large models (p=0.062)",
    480       "evidence": "Table 2: Mann-Whitney U test results for linguistic competence medium vs large comparison.",
    481       "supported": "strong"
    482     }
    483   ],
    484   "methodology_tags": [
    485     "benchmark-eval",
    486     "observational"
    487   ],
    488   "key_findings": "The paper demonstrates that linguistic competence (grammar, vocabulary, semantic understanding) plateaus at moderate model sizes (5-7B parameters) while internal factual knowledge continues scaling with model size. This suggests the Fundamental Language Model paradigm—decoupling linguistic processing from factual retrieval—is viable, allowing smaller models to handle language understanding while offloading factual tasks to external tools. Architecture and training data choices matter more than scale for linguistic ability.",
    489   "red_flags": [
    490     {
    491       "flag": "No training cutoff dates",
    492       "detail": "Critical for contamination assessment: paper doesn't state when each model's training ended, making it impossible to verify whether benchmark examples were in training data (especially TriviaQA, TruthfulQA)."
    493     },
    494     {
    495       "flag": "No variance or confidence intervals",
    496       "detail": "All results are single point estimates (Tables 1, 3-6). No standard deviations, error bars, or confidence intervals are provided, and each model appears to have been evaluated only once."
    497     },
    498     {
    499       "flag": "No statistical power analysis",
    500       "detail": "Sample size of 23 models not justified. No a priori power calculation or post-hoc power analysis to validate adequacy."
    501     },
    502     {
    503       "flag": "Hyperparameters not specified",
    504       "detail": "Only 'zero-shot' mentioned. Temperature, top-p, max_tokens, and other generation parameters not documented, harming reproducibility."
    505     },
    506     {
    507       "flag": "No code release",
    508       "detail": "Reproducibility depends on reverse-engineering LM Evaluation Harness configuration. No custom code or scripts provided."
    509     },
    510     {
    511       "flag": "Contamination unaddressed",
    512       "detail": "Standard benchmarks may have been in model training data; no analysis of whether results reflect memorization vs. true competence."
    513     },
    514     {
    515       "flag": "Limited scope to English",
    516       "detail": "Acknowledged in limitations but severely restricts generalizability. All benchmarks are English-only; linguistic competence for other languages unknown."
    517     },
    518     {
    519       "flag": "Model snapshot precision",
    520       "detail": "Model names and parameter counts given but no arxiv IDs, commit hashes, or exact training cutoffs for version identification."
    521     }
    522   ],
    523   "cited_papers": [
    524     {
    525       "title": "TinyStories: How Small Can Language Models Be and Still Speak Coherent English?",
    526       "relevance": "Directly supports core thesis that smaller models can achieve linguistic competence (Eldan & Li 2023)."
    527     },
    528     {
    529       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    530       "relevance": "Foundational RAG work; FLMs positioned as modular extension of RAG paradigm (Lewis et al. 2020)."
    531     },
    532     {
    533       "title": "A Primer in BERTology: What We Know About How BERT Works",
    534       "relevance": "Prior work on linguistic information distribution in transformers; supports hierarchical encoding claims (Rogers et al. 2021)."
    535     },
    536     {
    537       "title": "Language in Vivo vs. In Silico: Size Matters but Larger Language Models Still Do Not Comprehend Language on a Par with Humans",
    538       "relevance": "Empirical evidence that linguistic competence doesn't scale predictably with size (Dentella et al. 2024)."
    539     },
    540     {
    541       "title": "TruthfulQA: Measuring How Models Mimic Human Falsehoods",
    542       "relevance": "Key benchmark for internal factual knowledge evaluation (Lin et al. 2022)."
    543     },
    544     {
    545       "title": "BLiMP: A Benchmark of Linguistic Minimal Pairs for English",
    546       "relevance": "Core benchmark for grammatical competence assessment (Warstadt et al. 2020)."
    547     },
    548     {
    549       "title": "Findings of the Second BabyLM Challenge: Sample-Efficient Pretraining on Developmentally Plausible Corpora",
    550       "relevance": "Recent evidence that strong linguistic performance achievable with limited training data in small models (Hu et al. 2024)."
    551     },
    552     {
    553       "title": "When One LLM Drools, Multi-LLM Collaboration Rules",
    554       "relevance": "Agentic AI paradigm; supports modular approach over monolithic scaling (Feng et al. 2025)."
    555     }
    556   ],
    557   "engagement_factors": {
    558     "practical_relevance": {
    559       "score": 2,
    560       "justification": "Provides empirical guidance for modular model design but no working system, toolkit, or deployment examples. Practitioners need more concrete implementation guidance."
    561     },
    562     "surprise_contrarian": {
    563       "score": 2,
    564       "justification": "Challenges the 'bigger is better' narrative for linguistic competence but aligns with BabyLM findings. Moderately novel in quantifying the plateau, though not entirely surprising given recent work."
    565     },
    566     "fear_safety": {
    567       "score": 1,
    568       "justification": "Addresses hallucinations and memorization as architectural problems but frames these as design choices, not safety crises. Mild safety relevance."
    569     },
    570     "drama_conflict": {
    571       "score": 0,
    572       "justification": "Straightforward empirical benchmark evaluation. No controversy, competing claims, or dramatic narrative angle."
    573     },
    574     "demo_ability": {
    575       "score": 2,
    576       "justification": "Could demonstrate with public models (Qwen, Llama, Gemma available) and LM Evaluation Harness, but no demo code provided; readers must reconstruct."
    577     },
    578     "brand_recognition": {
    579       "score": 1,
    580       "justification": "Authors are from the University of Jaén (regional institution, not top-tier). Evaluates famous models (Meta Llama, Alibaba Qwen, Google Gemma) but is not published at a prestigious venue."
    581     }
    582   },
    583   "hn_data": {
    584     "threads": [
    585       {
    586         "hn_id": "25734627",
    587         "title": "AnyDB: An Architecture-Less DBMS for Any Workload",
    588         "points": 76,
    589         "comments": 7,
    590         "url": "https://news.ycombinator.com/item?id=25734627",
    591         "created_at": "2021-01-11T19:16:48Z"
    592       },
    593       {
    594         "hn_id": "46612901",
    595         "title": "HiGP: A high-performance Python package for Gaussian Process",
    596         "points": 5,
    597         "comments": 0,
    598         "url": "https://news.ycombinator.com/item?id=46612901",
    599         "created_at": "2026-01-14T06:11:08Z"
    600       },
    601       {
    602         "hn_id": "28264796",
    603         "title": "How to Fairly Share a Watermelon",
    604         "points": 4,
    605         "comments": 3,
    606         "url": "https://news.ycombinator.com/item?id=28264796",
    607         "created_at": "2021-08-22T12:07:33Z"
    608       },
    609       {
    610         "hn_id": "44477965",
    611         "title": "Establishing Best Practices for Building Rigorous Agentic Benchmarks",
    612         "points": 4,
    613         "comments": 0,
    614         "url": "https://news.ycombinator.com/item?id=44477965",
    615         "created_at": "2025-07-06T04:58:16Z"
    616       },
    617       {
    618         "hn_id": "25548440",
    619         "title": "The Theory of Interstellar Trade",
    620         "points": 3,
    621         "comments": 2,
    622         "url": "https://news.ycombinator.com/item?id=25548440",
    623         "created_at": "2020-12-27T03:02:24Z"
    624       },
    625       {
    626         "hn_id": "28445518",
    627         "title": "Find Bugs in Static Bug Finders",
    628         "points": 2,
    629         "comments": 1,
    630         "url": "https://news.ycombinator.com/item?id=28445518",
    631         "created_at": "2021-09-07T14:46:30Z"
    632       },
    633       {
    634         "hn_id": "44463319",
    635         "title": "Establishing Best Practices for Building Rigorous Agentic Benchmarks",
    636         "points": 2,
    637         "comments": 0,
    638         "url": "https://news.ycombinator.com/item?id=44463319",
    639         "created_at": "2025-07-04T10:46:05Z"
    640       },
    641       {
    642         "hn_id": "2314112",
    643         "title": "Paul Krugman on the \"Theory Of Interstellar Trade\"",
    644         "points": 2,
    645         "comments": 0,
    646         "url": "https://news.ycombinator.com/item?id=2314112",
    647         "created_at": "2011-03-11T17:37:27Z"
    648       },
    649       {
    650         "hn_id": "24426717",
    651         "title": "Kilt: A Benchmark for Knowledge Intensive Language Tasks",
    652         "points": 1,
    653         "comments": 0,
    654         "url": "https://news.ycombinator.com/item?id=24426717",
    655         "created_at": "2020-09-09T22:38:30Z"
    656       }
    657     ],
    658     "top_points": 76,
    659     "total_points": 99,
    660     "total_comments": 13
    661   }
    662 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs