scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29426B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Towards Fundamental Language Models: Does Linguistic Competence Scale with Model Size?",
      6     "authors": [
      7       "Jaime Collado-Montañez",
      8       "L. Alfonso Ureña-López",
      9       "Arturo Montejo-Ráez"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2509.02225",
     14     "doi": "10.48550/arXiv.2509.02225"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Main claims in abstract (linguistic competence improves with scale, internal factual knowledge grows faster) are supported by Table 1 regression analysis and Mann-Whitney tests.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "Paper frames findings causally ('model size is more closely tied to memorization') but uses only correlational design (regression, benchmarking). No intervention or causal manipulation performed.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Evaluation limited to English, 7 model families, 135M-32B parameters, but conclusions generalize to 'FLM paradigm', 'all LLMs', 'future language modeling' without qualification of scope.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Limited consideration of alternatives in results; findings presented as direct evidence. Limitations section acknowledges linguistic-factual boundary is fuzzy but doesn't revisit this when interpreting results.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Paper claims to measure 'linguistic competence' per CEFR framework but uses benchmarks (WiC, BLiMP, RTE) that measure task performance, not direct competence. Gap between benchmark scores and claimed linguistic competence never bridged.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Dedicated 'Limitations' section on page 8 discusses generalization across languages, architectural specificity, English-only evaluation, and fuzzy boundaries between linguistic/factual knowledge.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Notes specific threats: 'Our sample of languages is English' (not just 'may not generalize'), 'specific model architectures', performance at smaller sizes 'might not generalize across all linguistic tasks'—specific enough to be useful.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Explicitly bounds scope: models 135M-32B, benchmarks via LM Evaluation Harness, English language, specific model families (SmolLM2, Qwen, Llama, OLMo, Falcon, Gemma, Yi).",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Acknowledgments section lists Spanish Government funding (CONSENSO, MODERATES, SocialTOX projects) and EU NextGenerationEU funding.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors affiliated with Computer Science Department, University of Jaén, Spain—no affiliation with model vendors being evaluated.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Spanish Government funding for general NLP/linguistic research. Funder is independent of evaluated models; no vendor funding detected.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement provided. No disclosure of patents, equity, or consulting relationships.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Linguistic competence precisely defined via CEFR framework (lexical, grammatical, semantic); factual knowledge split into external (reasoning over context) and internal (memorization). FLM paradigm described.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Opening sentence: 'This paper introduces and empirically supports the Fundamental Language Model (FLM) paradigm.' Contribution is empirical investigation of whether linguistic competence decouples from factual knowledge at different scales.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 systematically engages with transformer linguistics (Rogers et al., Garnier-Brun), BabyLM challenge findings, Sapir-Whorf hypothesis, RAG systems, agentic AI paradigm, and scaling law literature.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "No code repository, GitHub link, or release statement. Uses LM Evaluation Harness but doesn't provide own code or scripts.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "All benchmarks used are publicly available (WiC, BLiMP, RTE, MNLI, QQP, LAMBADA, BoolQ, COPA, MultiRC, ReCoRD, TriviaQA, TruthfulQA). Standard public benchmarks, no custom dataset.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Only states 'two NVIDIA Ampere A100 GPUs'. Missing: Python version, LM Evaluation Harness version, CUDA/cuDNN versions, other dependencies. Insufficient for reproduction.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "States 'we follow standard evaluation protocols implemented within the LM Evaluation Harness' but provides no step-by-step instructions, no command-line examples, no configuration details.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Table 1 reports single point estimates per model. No error bars, confidence intervals, or multiple runs mentioned. No variance quantified.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Mann-Whitney U tests applied to compare size categories (Q1 vs Q3, Small vs Medium, Medium vs Large) with α=0.05 significance level (Table 2). Proper non-parametric test for non-normal distributions.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Linear regression slopes reported (0.059 for IFK, 0.029 for linguistic competence), R² values provided (0.81 vs 0.50), performance differences quantified in tables. Effect sizes present but could be clearer.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "22 models evaluated. No power analysis, no justification for why this particular set of models or this size of sample.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Single point estimate per model with no error bars, confidence intervals, or standard deviations. No multiple runs or variance quantification across seeds/runs.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "Models compared to each other, but no external baselines (e.g., random, human performance, or standard reference models). Evaluation is within-dataset comparison only.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "All compared models are from 2024-2025 (SmolLM2, Qwen2.5, Llama-3, OLMo-2, Falcon3, Gemma-2, Yi-1.5). Recent and contemporary.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "No systematic component ablation. Paper varies model size and evaluates across competencies, but this is comparison not ablation—no removal of specific components to isolate effects.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Multiple benchmarks per competence: linguistic (WiC, BLiMP, RTE, MNLI, QQP), external (LAMBADA, BoolQ, COPA, MultiRC, ReCoRD), internal (TriviaQA, TruthfulQA variants). Tables 3-6 show per-task breakdowns.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "No human evaluation of model outputs. Benchmarks use automated metrics (accuracy, F1, BLEU, ROUGE). TruthfulQA has human-authored content but model responses scored automatically.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Using established benchmarks which have standard train/test splits. Implicitly using test sets but not explicitly confirmed for each benchmark.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Breakdowns by competence type (linguistic, external, internal), sub-competence (lexical, grammatical, semantic in Table 3), and per-benchmark (Tables 4-6). Detailed categorical analysis provided.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "Discusses diminishing returns beyond 5-7B parameters but no analysis of specific failure modes, error patterns, or per-task failure analysis.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Reports null findings: Mann-Whitney U shows Medium vs Large has no significant difference (p=0.062 for linguistic competence, Table 2). Diminishing returns beyond certain model size explicitly noted.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Specific model names provided (e.g., 'Qwen2.5-32B', 'Llama-3.2-1B'). 2024-2025 models referenced. However, no snapshot/download dates or exact commit hashes provided.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "No actual prompts shown. States 'follow the standard evaluation protocols implemented within the LM Evaluation Harness' but provides no prompt text.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "'Zero-shot setting' specified. Missing: temperature, top-p, max_tokens, sampling strategy, number of samples per prompt. Appears to use framework defaults without reporting.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "Evaluating base models without agentic scaffolding. Criterion not applicable.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Using standard public benchmarks 'as-is'. No custom preprocessing described because evaluating on unmodified public datasets. Minimal preprocessing expected and not detailed.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "All benchmarks are publicly available (WiC, BLiMP, RTE, MNLI, QQP, LAMBADA, BoolQ, COPA, MultiRC, ReCoRD, TriviaQA, TruthfulQA). Raw benchmark data can be downloaded.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": false,
    273           "answer": false,
    274           "justification": "No new data collected—using only existing public benchmarks. Data collection criterion not applicable.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants. Criterion not applicable.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "States 'standard evaluation protocols implemented within the LM Evaluation Harness' but provides no detailed pipeline documentation—relying on framework documentation rather than own specification.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Models are from 2024-2025 but exact training cutoffs not stated. Benchmarks are public but their inclusion in training data not discussed.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No explicit discussion of whether benchmarks appeared in model training data. Public benchmarks were likely included in training corpora of modern models.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "No acknowledgment or mitigation of potential contamination risk. Public benchmarks likely in training data of modern LLMs but no testing or discussion.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants. Not applicable.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants. Not applicable.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants. Not applicable.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants. Not applicable.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants. Not applicable.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants. Not applicable.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants. Not applicable.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "Only hardware stated ('two NVIDIA Ampere A100 GPUs'). No inference latency, cost per token, throughput, or wall-clock time reported.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "GPU hardware mentioned but no total compute budget, GPU-hours, or cost. Incomplete practicality assessment.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Linguistic competence stabilizes at moderate model sizes (5-7B parameters) while internal factual knowledge continues scaling",
    373       "evidence": "Table 1 shows Qwen2.5-7B (0.7239) vs Qwen2.5-32B (0.7688) linguistic competence difference of only 6.2%, whereas internal factual knowledge grows from 0.3796 to 0.4288 (13% gain). Linear regression R²=0.50 for linguistic vs R²=0.81 for internal.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Internal factual knowledge scales with model size at roughly 2x the rate of linguistic competence",
    378       "evidence": "Regression slopes: internal factual knowledge slope=0.059 vs linguistic competence slope=0.029, a 2.03x difference (Section 4.1.1, Figure 2).",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Model size explains over 80% of variance in internal factual knowledge but only about 50% for linguistic competence",
    383       "evidence": "Section 4.1.1: 'model size explains over 80% of the variance in internal factual knowledge scores (R² = 0.81), but only about 50% for linguistic competence.' Explicitly stated.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "External factual knowledge does not improve significantly beyond 9-10B parameters",
    388       "evidence": "Table 5: Gemma-2-9b achieves 0.7961, Falcon3-10B 0.7746, both outperforming Qwen2.5-32B (0.7007). Mann-Whitney U shows no significant improvement from Medium to Large (p=0.591).",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "The FLM paradigm (separating linguistic competence from factual knowledge into modular systems) is viable",
    393       "evidence": "Shows linguistic competence stabilizes at moderate sizes, supporting use of smaller models for language processing. However, no actual FLM systems tested—only benchmarking, not deployment validation.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Linguistic competence is more dependent on architecture and training data than on model size",
    398       "evidence": "Discussion section notes 'Qwen2.5-3B remains close to the largest models evaluated: 0.6909 as compared to OLMo-2-0325-32B, which scored 0.7095' (page 5), suggesting architecture matters. But only 7 model families tested—limited evidence.",
    399       "supported": "weak"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "benchmark-eval",
    404     "observational"
    405   ],
    406   "key_findings": "Evaluation of 22 language models (135M–32B parameters) across 12 benchmarks reveals that linguistic competence (lexical, grammatical, semantic) stabilizes at 5–7B parameters with diminishing returns beyond that scale, while internal factual knowledge continues to improve significantly with model size. Linear regression analysis shows model size explains 80% of variance in factual memorization but only 50% in linguistic competence, with a 2x steeper scaling slope for memorization. These findings support a modular language model architecture where compact linguistically-proficient models delegate factual retrieval to external systems, though no actual FLM systems were tested.",
    407   "red_flags": [
    408     {
    409       "flag": "Proxy-outcome mismatch",
    410       "detail": "Claims to measure 'linguistic competence' per CEFR framework (vocabulary, grammar, meaning in context) but uses benchmark accuracy scores (WiC, BLiMP, RTE) as proxies. Gap between benchmark performance and true linguistic competence never bridged; benchmarks may reflect task-specific optimization rather than competence."
    411     },
    412     {
    413       "flag": "Generalization overreach",
    414       "detail": "Evaluation limited to English-only, 7 model families, 135M–32B range, specific 12-benchmark set; conclusions generalize to 'FLM paradigm', 'future language modeling', 'all LLMs' without explicit scope caveats in main text."
    415     },
    416     {
    417       "flag": "Causal framing without causal design",
    418       "detail": "States 'model size is more closely tied to memorization than to core language ability' (p.5) and 'internal factual knowledge is highly dependent on model size' (p.7) but uses only correlational benchmarking. No intervention or counterfactual comparison performed."
    419     },
    420     {
    421       "flag": "No variance reported across runs",
    422       "detail": "Single point estimates per model with no error bars, confidence intervals, standard deviations, or indication of run-to-run variation. Cannot assess measurement uncertainty."
    423     },
    424     {
    425       "flag": "Benchmark contamination unaddressed",
    426       "detail": "Public benchmarks (WiC, RTE, TriviaQA, etc.) likely included in training data of 2024–2025 models. No explicit discussion or mitigation of train/test overlap. Training cutoff dates not stated."
    427     },
    428     {
    429       "flag": "Reproduction details insufficient",
    430       "detail": "No released code, no prompts provided, no LM Evaluation Harness version specified, no hyperparameters (temperature, top-p, max_tokens) reported. 'Follow standard protocols' is too vague to reproduce."
    431     },
    432     {
    433       "flag": "No external baselines or human validation",
    434       "detail": "All evaluation via automated metrics on benchmarks. No human assessment of model linguistic competence. No comparison to human performance or external gold standards."
    435     },
    436     {
    437       "flag": "Sample size unjustified",
    438       "detail": "22 models evaluated with no power analysis or justification for sample size. Why these specific 7 model families? Why not more recent checkpoints or ablated variants?"
    439     },
    440     {
    441       "flag": "Training cutoff not stated",
    442       "detail": "Models from 2024–2025 but exact training data cutoff dates not disclosed. Cannot verify whether benchmark contamination occurred. Essential for interpreting results on public benchmarks."
    443     }
    444   ],
    445   "cited_papers": [
    446     {
    447       "title": "TinyStories: How small can language models be and still speak coherent English?",
    448       "authors": "Eldan & Li",
    449       "year": 2023,
    450       "relevance": "Foundational work supporting viability of small linguistically-competent models; directly cited as motivation for FLM hypothesis."
    451     },
    452     {
    453       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    454       "authors": "Lewis et al.",
    455       "year": 2020,
    456       "relevance": "Foundational RAG work that inspired the idea of separating factual retrieval from language modeling; core concept for FLM systems."
    457     },
    458     {
    459       "title": "Scaling Laws for Neural Language Models",
    460       "authors": "Kaplan et al.",
    461       "year": 2020,
    462       "relevance": "Foundational scaling laws work; motivates investigation of differential scaling between linguistic competence and memorization."
    463     },
    464     {
    465       "title": "A Primer in BERTology: What BERT Learns About the Structure of Language",
    466       "authors": "Rogers, Kovaleva, Rumshisky",
    467       "year": 2021,
    468       "relevance": "Analysis of how transformer layers encode linguistic information hierarchically; relevant to understanding what smaller models retain."
    469     },
    470     {
    471       "title": "On the scaling limits of LLMs for logical reasoning",
    472       "authors": "Lin et al.",
    473       "year": 2025,
    474       "relevance": "Recent work showing reasoning may saturate with scale; supports claim that not all capabilities scale linearly with model size."
    475     },
    476     {
    477       "title": "Language in vivo vs. in silico: Size matters but larger language models still do not comprehend language on a par with humans",
    478       "authors": "Dentella et al.",
    479       "year": 2024,
    480       "relevance": "Empirical evidence that linguistic competence in LLMs plateaus; questions whether scaling alone improves language understanding."
    481     },
    482     {
    483       "title": "A Survey on Evaluation of Large Language Models",
    484       "authors": "Chang et al.",
    485       "year": 2024,
    486       "relevance": "Comprehensive survey of LLM evaluation methodologies and benchmarks; contextualizes this work's benchmark selection."
    487     },
    488     {
    489       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    490       "authors": "Schaeffer et al.",
    491       "year": 2023,
    492       "relevance": "Challenges narrative that emergent abilities appear suddenly with scale; relevant to nuancing claims about size-dependent improvement."
    493     }
    494   ],
    495   "engagement_factors": {
    496     "practical_relevance": {
    497       "score": 2,
    498       "justification": "Findings could guide model design (build smaller linguistic models), but no practical FLM systems built or deployed; theoretical contribution only."
    499     },
    500     "surprise_contrarian": {
    501       "score": 2,
    502       "justification": "Challenges 'bigger is better' scaling narrative in one dimension (linguistic competence), but builds incrementally on prior work (Eldan & Li 2023, scaling laws)."
    503     },
    504     "fear_safety": {
    505       "score": 1,
    506       "justification": "Discusses hallucinations and biases as motivation but doesn't deeply engage with AI safety concerns; tangential rather than core focus."
    507     },
    508     "drama_conflict": {
    509       "score": 1,
    510       "justification": "Modest methodological controversy about scaling narratives; no funding conflicts, ethical drama, or social amplification angle."
    511     },
    512     "demo_ability": {
    513       "score": 2,
    514       "justification": "All evaluated models are standard and publicly downloadable, but the authors provide no code or reproduction scripts; reproducing the results would require reimplementation."
    515     },
    516     "brand_recognition": {
    517       "score": 1,
    518       "justification": "University of Jaén (non-major AI research brand); uses well-known models but introduces no novel models, datasets, or tools."
    519     }
    520   },
    521   "hn_data": {
    522     "threads": [
    523       {
    524         "hn_id": "25734627",
    525         "title": "AnyDB: An Architecture-Less DBMS for Any Workload",
    526         "points": 76,
    527         "comments": 7,
    528         "url": "https://news.ycombinator.com/item?id=25734627",
    529         "created_at": "2021-01-11T19:16:48Z"
    530       },
    531       {
    532         "hn_id": "46612901",
    533         "title": "HiGP: A high-performance Python package for Gaussian Process",
    534         "points": 5,
    535         "comments": 0,
    536         "url": "https://news.ycombinator.com/item?id=46612901",
    537         "created_at": "2026-01-14T06:11:08Z"
    538       },
    539       {
    540         "hn_id": "28264796",
    541         "title": "How to Fairly Share a Watermelon",
    542         "points": 4,
    543         "comments": 3,
    544         "url": "https://news.ycombinator.com/item?id=28264796",
    545         "created_at": "2021-08-22T12:07:33Z"
    546       },
    547       {
    548         "hn_id": "44477965",
    549         "title": "Establishing Best Practices for Building Rigorous Agentic Benchmarks",
    550         "points": 4,
    551         "comments": 0,
    552         "url": "https://news.ycombinator.com/item?id=44477965",
    553         "created_at": "2025-07-06T04:58:16Z"
    554       },
    555       {
    556         "hn_id": "25548440",
    557         "title": "The Theory of Interstellar Trade",
    558         "points": 3,
    559         "comments": 2,
    560         "url": "https://news.ycombinator.com/item?id=25548440",
    561         "created_at": "2020-12-27T03:02:24Z"
    562       },
    563       {
    564         "hn_id": "28445518",
    565         "title": "Find Bugs in Static Bug Finders",
    566         "points": 2,
    567         "comments": 1,
    568         "url": "https://news.ycombinator.com/item?id=28445518",
    569         "created_at": "2021-09-07T14:46:30Z"
    570       },
    571       {
    572         "hn_id": "44463319",
    573         "title": "Establishing Best Practices for Building Rigorous Agentic Benchmarks",
    574         "points": 2,
    575         "comments": 0,
    576         "url": "https://news.ycombinator.com/item?id=44463319",
    577         "created_at": "2025-07-04T10:46:05Z"
    578       },
    579       {
    580         "hn_id": "2314112",
    581         "title": "Paul Krugman on the \"Theory Of Interstellar Trade\"",
    582         "points": 2,
    583         "comments": 0,
    584         "url": "https://news.ycombinator.com/item?id=2314112",
    585         "created_at": "2011-03-11T17:37:27Z"
    586       },
    587       {
    588         "hn_id": "24426717",
    589         "title": "Kilt: A Benchmark for Knowledge Intensive Language Tasks",
    590         "points": 1,
    591         "comments": 0,
    592         "url": "https://news.ycombinator.com/item?id=24426717",
    593         "created_at": "2020-09-09T22:38:30Z"
    594       }
    595     ],
    596     "top_points": 76,
    597     "total_points": 99,
    598     "total_comments": 13
    599   }
    600 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs