scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30376B)
      1 {
      2   "paper": {
      3     "title": "LaCy: What Small Language Models Can and Should Learn is Not Just a Question of Loss",
      4     "authors": [
      5       "Szilvia Ujváry",
      6       "Louis Béthune",
      7       "Pierre Ablin",
      8       "João Monteiro",
      9       "Marco Cuturi",
     10       "Michael Kirchhof"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv",
     14     "arxiv_id": "2602.12005"
     15   },
     16   "scan_version": 2,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "LaCy combines spaCy grammar parsing with loss signals to decide which tokens an SLM should learn vs delegate to a larger model via a <CALL> token. On a 334M parameter model cascading with Llama 3.2 1B, LaCy achieves 22.71% FactScore vs 15.89% baseline and the lowest fact leakage (11.28%). The key insight is that cross-entropy loss is blind to token type: high-loss non-factual tokens are often acceptable alternatives, while high-loss factual tokens cause hallucinations. The method's spaCy annotation runs on CPU only, making it cheaper than GPU-based alternatives like LLM judge annotations.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The dwiki dataset is publicly available from the OLMo2 project (Groeneveld et al., 2024). The LLM judge annotations are available at 'kilian-group/LMLM-pretrain-dwiki6.1M' (Section A.2.2). FactScore and NLU benchmarks are standard public datasets."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Tables 3 and 4 describe architecture and hyperparameters, and Section A.4 mentions 8 A100-80GB GPUs and bfloat16 precision. However, there is no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions, README, or scripts for replicating experiments are provided."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results (Figures 2, 6, 7; Tables 1, 6) report point estimates only with no confidence intervals, error bars, or ± notation."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Claims like 'LaCy outperforms all previous methods' (Section 5.2) are based solely on comparing point estimates without any statistical significance tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 5.2 reports 'an increase of 6.88% compared to the baseline' for FactScore. Figures 2 and 11-12 show absolute values for all methods, providing context for the magnitude of improvements."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No justification for sample sizes: the acceptability experiment uses 112 documents (~44k tokens), FactScore uses 183 entities, and NLU uses standard benchmarks. No power analysis or sample size reasoning is provided."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No variance, standard deviation, or spread measures are reported for any experiment. All results appear to be single-run numbers."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Section 5.1 describes four baselines: vanilla Baseline (no CALL), Loss-based calls, LLM judge (Zhao et al., 2025), and Rho-1 (Lin et al., 2024). All are reimplemented with the same budgets and data."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Baselines include Rho-1 (Lin et al., 2024) and LLM judge (Zhao et al., 2025), which are recent and represent the state of the art in token selection for pretraining."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Section 5.4 presents ablations of LaCy's components: spaCy only, spaCy + Reference Model, LaCy + Ignorefacts, and LaCy + Ignore. Figure 6 and Table 5 show the contribution of each component."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Results are reported on FactScore (factual precision), Fact Leakage (factual QA without calling), NLU benchmarks (ARC-Easy, HellaSwag, PIQA, SIQA), and validation losses (call/non-call/total)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "All evaluation is automated: FactScore uses GPT-3.5 turbo, NLU uses log-likelihood comparison, and the acceptability analysis uses Gemini 2.0 Flash as judge. No human evaluation of generated outputs is included."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section A.4.3 states '10% of the dwiki dataset as validation set.' FactScore uses a fixed set of 183 entities (Section B). NLU benchmarks use their standard test sets."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Table 1 breaks down NLU results by benchmark (ARC-Easy, HellaSwag, PIQA, SIQA). Figure 5 separates call vs non-call losses. Fact leakage is broken down by benchmark (BigBench QA Wikidata, PopQA long-tail)."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 6 acknowledges 'the model sometimes tries to predict factual tokens it should not.' Figure 4 shows example generations with factual errors highlighted in red. Section 5.2 notes 'not all facts inserted by the cascade partner are true.'"
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Table 6 shows LaCy+Ignore degrades NLU performance. Section 5.7 shows loss is not correlated with FactScore. Figure 10 shows ablation improvements disappear when equalizing forward steps. Section 5.5 reports that freeing capacity by offloading facts does not improve NLU."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims higher FactScores and outperformance of Rho and LLM-judge SLMs. Figure 2 confirms LaCy achieves 22.71% vs Rho-1 21.63% and LLM judge 20.97%. The claim of being 'simpler and cheaper' is supported by Table 2."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Causal claims ('LaCy models successfully learn which tokens to predict') are supported by controlled experiments: all methods use the same architecture, data, and compute budget, varying only the token selection mechanism. Ablations in Section 5.4 isolate the contribution of each component."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title 'What Small Language Models Can and Should Learn' is broad, but all experiments are on Wikipedia domain only (dwiki dataset), with GPT-2 architectures at 334M and 1.3B scale. Section 6 calls it 'an explorative pilot study' but the title and framing do not bound claims to this narrow setting."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper does not substantively discuss alternative explanations for why LaCy outperforms baselines. Section 5.4 ablates components but does not consider whether improvements could be due to confounds such as the specific grammar parser, the dwiki domain, or the particular cascade model."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper carefully distinguishes between its measurements (FactScore, fact leakage, NLU benchmarks) and their interpretation. Section 5.7 explicitly warns that validation loss is not a valid proxy for factual accuracy in token-selection settings. The paper does not overclaim from its proxies."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "GPT-2 architectures are trained from scratch with full specifications in Table 3. Cascade models are specified as 'Llama-3.2-1B' and 'Qwen 3 32B' with references to model cards and technical reports. Gemini 2.0 Flash (used only for acceptability analysis) and GPT-3.5 turbo (for FactScore) are also named."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Full acceptability judgment prompt is provided in Section A.1. RAG prompt for Qwen 3 32B is provided in Section A.5. FactScore prompt template is given in Section B: 'Tell me a bio of <name>. <name> is'. PopQA rephrasing approach is described."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Table 4 reports batch size, total steps, learning rate, warmup, and precision for all model sizes. Section A.4 specifies AdamW with weight decay 0.1, gradient accumulation across 4 steps, context length 1024. Section A.5 specifies greedy decoding with repetition penalty 1.2."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The cascade mechanism (SLM generates until CALL token, then cascade model generates one token) is simple and fully described in Section A.5, but this is not scaffolding in the agentic sense."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section A.2.1 provides detailed documentation of the spaCy annotation pipeline: named entity processing, supplementary entity detection with specific heuristics, grammatical word classification, and tokenization. Section A.2.2 documents the LMLM data processing conversion."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 6 (Conclusion) contains two substantive paragraphs discussing limitations: small-scale training may cause the model to predict factual tokens it shouldn't, and the paper excludes the question of what to do after calling. While not a separate section, it is substantive."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 6 identifies specific threats: (1) 'The model sometimes tries to predict factual tokens it should not, which we believe is mostly because it was trained at a small scale,' and (2) the exclusion of the downstream handling of CALL tokens. These are specific to this study."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "While Section 6 calls this 'an explorative pilot study' and Section 1 states 'we focus fully on the first question,' the paper does not explicitly state what its results do NOT show — e.g., it doesn't bound claims to the Wikipedia domain, GPT-2 architectures, or the specific model scales tested."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The dwiki dataset is public, but experimental outputs (model generations, FactScore evaluations, spaCy annotations, trained model weights) are not released for independent verification."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section A.2 describes the dwiki dataset source (OLMo2 project, ~3B tokens of Wikipedia). The acceptability experiment uses 112 validation documents covering ~44k tokens (Section 3). Data processing steps are documented."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. All data comes from standard public datasets (dwiki, FactScore entities, NLU benchmarks)."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Section A.2.1 documents the full annotation pipeline from raw text to token-level labels in detailed steps (named entity processing → supplementary detection → grammatical classification → tokenization). Section A.2.2 documents the LMLM conversion pipeline."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding disclosure. Section 7 (Acknowledgements) thanks individuals but does not mention any funding sources or grants. Apple is listed as the primary affiliation, but there is no explicit funding statement."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: 'Apple, †University of Cambridge, work done as an intern at Apple.' The Apple trademark notice is also included."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Apple has a commercial interest in SLM technology (Apple Intelligence, on-device models). The research was conducted at Apple, and Apple benefits from demonstrating that SLMs can be effective with appropriate training methods. This conflict is not acknowledged."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement is present in the paper. Research on SLM effectiveness by Apple employees, for a company that deploys SLMs commercially, represents an undisclosed potential financial interest."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "The SLMs are trained from scratch on dwiki, so their training data is known. However, the cascade model Llama 3.2 1B (whose outputs directly affect FactScore) has no training cutoff stated. FactScore uses GPT-3.5 turbo with no cutoff mentioned either."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No discussion of potential overlap between dwiki training data and evaluation benchmarks (FactScore entities, NLU test sets). The SLM is trained on Wikipedia and evaluated on biography generation from Wikipedia — the overlap is obvious but unaddressed."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "NLU benchmarks (ARC-Easy, HellaSwag, PIQA, SIQA) and QA benchmarks (BigBench, PopQA) were all published before the models' training. No contamination analysis is provided for either the SLM or the cascade partner Llama 3.2 1B."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "Table 2 reports preprocessing overhead for label generation, but inference cost/latency for the cascade system (SLM + Llama 3.2 1B) is not reported. The added cost of cascading (querying a larger model for ~22% of tokens) is not quantified."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Section A.4: 'train most of our models on 8 A100-80GB GPUs' and 'Training on 8 GPUs finishes in 3 days.' Table 2 reports preprocessing overhead (152 h/1B tokens on CPU for LaCy). Table 4 specifies total training steps."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No mention of multiple random seeds. All results appear to be from single training runs."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The number of experimental runs is never stated. Results are presented without indicating how many runs produced them."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No hyperparameter search is described. The learning rate, batch size, and other settings appear chosen without documented search. The CALL percentage (15%) is matched to the LLM judge baseline for fairness but not searched."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "The 15% call rate is explicitly justified as matching the LLM judge baseline for fair comparison (Section 4). Training steps are calibrated to equalize gradient signals (Section A.4). Hyperparameters follow standard practices from prior work."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "Multiple comparisons are made across 5 methods, 6+ metrics, and 5 ablation variants. No statistical tests are performed at all, let alone corrections for multiple comparisons."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors compare LaCy against their own reimplementations of all baselines (Loss-based, Rho-1, LLM judge). Section 5.1 notes they 'reimplement and pretrain these methods' but does not acknowledge the potential bias of evaluating their own system against their own baseline implementations."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": true,
    332         "justification": "Section 5.4 and Figure 10 compare performance when equalizing training steps. Table 2 reports preprocessing overhead. Section A.4 explains that calling models train 15% longer to equalize the number of tokens receiving gradient signals."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "FactScore is used to measure factual precision. Section 5.7 questions loss as a metric but does not question whether FactScore adequately captures the claimed capability of factual accuracy in generation. FactScore relies on GPT-3.5 turbo for fact verification, which itself can err."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "All methods use the same cascade setup with Llama 3.2 1B, the same call budget (22% of tokens), and the same decoding strategy. Section A.5 describes the cascade protocol applied uniformly. The scaffold is controlled across comparisons."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "No discussion of whether the dwiki training data contains information that could leak into benchmarks temporally. The SLM is trained on Wikipedia which contains factual information also tested by FactScore."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of feature leakage. The FactScore evaluation uses Wikipedia pages as ground truth, and the model is trained on Wikipedia data (dwiki) — the feature overlap between training and evaluation is not addressed."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "Training (dwiki, a Wikipedia subset from OLMo2) and evaluation (FactScore biography generation verified against Wikipedia) draw from the same source (Wikipedia). This obvious non-independence is not discussed."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No leakage detection or prevention method is applied. No checks for overlap between dwiki training documents and FactScore evaluation entities."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "LaCy achieves the highest FactScore (22.71%) among all methods when cascading with Llama 3.2 1B, reported as a 6.88 percentage point improvement over the no-call baseline (15.89%), although the two listed scores differ by 6.82 points.",
    371       "evidence": "Figure 2 (left) and Section 5.2. LaCy 22.71%, Rho-1 21.63%, LLM judge 20.97%, Loss-based 19.44%, Baseline 15.89%. Single-run results on 183 FactScore entities.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "LaCy achieves the lowest fact leakage (11.28%) on factual QA benchmarks, meaning the least factual knowledge was trained into the SLM's parametric memory.",
    376       "evidence": "Figure 2 (right). LaCy 11.28%, LLM judge 14.25%, Rho-1 15.91%, Baseline 18.45%, Loss-based 19.55%. Evaluated on BigBench QA Wikidata and PopQA long-tail.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "Cross-entropy loss is blind to the type of error: non-factual tokens with high loss are often acceptable alternative continuations, while high-loss factual tokens are not.",
    381       "evidence": "Section 3 and Figure 3 (right). Acceptability experiment on a single batch of 112 documents (~44k tokens) with Gemini 2.0 Flash as judge, showing non-facts have higher acceptability than facts at the same loss levels.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Factual offloading does not significantly degrade Natural Language Understanding.",
    386       "evidence": "Table 1. LaCy achieves 39.9% average NLU vs 39.6% baseline across ARC-Easy, HellaSwag, PIQA, SIQA. Differences are less than 2 percentage points across all methods.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Validation loss is not correlated with FactScore in the token-selection setting.",
    391       "evidence": "Section 5.7 and Figure 7. Neither call loss, non-call loss, nor total loss is predictive of FactScore across the compared methods.",
    392       "supported": "weak"
    393     },
    394     {
    395       "claim": "LaCy's spaCy-based labeling has minimal throughput overhead and runs on CPU, not GPU.",
    396       "evidence": "Table 2. LaCy: 152 h/1B tokens on a single CPU core. LLM judge: 233 h/1B tokens on a single A100 GPU. Rho-1: 56 h/1B tokens on a single A100 GPU.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "The loss signal is necessary: spaCy annotation alone (without loss thresholding) performs worse than LaCy.",
    401       "evidence": "Section 5.4 and Figure 6. spaCy only achieves lower FactScore and higher fact leakage than full LaCy.",
    402       "supported": "moderate"
    403     }
    404   ],
    405   "red_flags": [
    406     {
    407       "flag": "No error bars or uncertainty quantification",
    408       "detail": "All results across all experiments (FactScore, fact leakage, NLU, validation losses, ablations) are single-point estimates with no confidence intervals, standard deviations, or indication of variance across runs. It is impossible to assess whether the differences between methods are statistically meaningful."
    409     },
    410     {
    411       "flag": "Apparent single-run experiments",
    412       "detail": "No mention of multiple random seeds or repeated runs anywhere in the paper. Given that the differences between methods are modest (e.g., LaCy 22.71% vs Rho-1 21.63% on FactScore, a 1.08pp gap), a single run provides no evidence that these differences are robust."
    413     },
    414     {
    415       "flag": "Undisclosed conflict of interest",
    416       "detail": "All Apple-affiliated authors research SLM methods at a company that commercially deploys SLMs (Apple Intelligence). The paper promotes SLM capability through delegation, which aligns with Apple's product strategy. No funding disclosure or competing interests statement is present."
    417     },
    418     {
    419       "flag": "Training-evaluation domain overlap",
    420       "detail": "The SLM is trained on dwiki (Wikipedia data) and evaluated on FactScore (biography generation verified against Wikipedia). This domain overlap is never discussed, raising concerns about whether the approach generalizes beyond Wikipedia-style content."
    421     },
    422     {
    423       "flag": "Self-reimplemented baselines",
    424       "detail": "All baselines (Loss-based, Rho-1, LLM judge) are reimplemented by the authors (Section 5.1). While using the same budgets and data is fair, the authors' implementations of competing methods may not match the original authors' best configurations, per Lucic et al. (2018)."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Physics of language models: part 3.1, knowledge storage and extraction",
    430       "authors": ["Zeyuan Allen-Zhu", "Yuanzhi Li"],
    431       "year": 2024,
    432       "relevance": "Foundational work on LLM knowledge storage capacity limits, which motivates the token delegation approach."
    433     },
    434     {
    435       "title": "Small language models are the future of agentic AI",
    436       "authors": ["Peter Belcak", "Greg Heinrich", "Shizhe Diao", "Yonggan Fu", "Xin Dong", "Saurav Muralidharan", "Yingyan Celine Lin", "Pavlo Molchanov"],
    437       "year": 2025,
    438       "arxiv_id": "2506.02153",
    439       "relevance": "Argues for SLMs as the future of agentic AI, directly relevant to why SLM capability and delegation matter."
    440     },
    441     {
    442       "title": "Prioritized training on points that are learnable, worth learning, and not yet learnt",
    443       "authors": ["Sören Mindermann", "Jan M Brauner", "Muhammed T Razzak"],
    444       "year": 2022,
    445       "relevance": "Introduces Rho-loss for prioritized training, a key baseline and predecessor to the token selection approach."
    446     },
    447     {
    448       "title": "Not all tokens are what you need for pretraining",
    449       "authors": ["Zhenghao Lin", "Zhibin Gou", "Yeyun Gong"],
    450       "year": 2024,
    451       "relevance": "Proposes Rho-1, a key baseline that selects tokens based on loss difference with a reference model."
    452     },
    453     {
    454       "title": "Pre-training large memory language models with internal and external knowledge",
    455       "authors": ["Linxi Zhao", "Sofian Zalouk", "Christian K. Belardi"],
    456       "year": 2025,
    457       "arxiv_id": "2505.15962",
    458       "relevance": "Key baseline (LMLM/LLM judge) that uses GPT-4o to annotate factual tokens for delegation during pretraining."
    459     },
    460     {
    461       "title": "I don't know: Explicit modeling of uncertainty with an [IDK] token",
    462       "authors": ["Roi Cohen", "Konstantin Dobler", "Eden Biran", "Gerard de Melo"],
    463       "year": 2024,
    464       "relevance": "Proposes shifting logits onto an IDK token when models are wrong, related to the CALL token delegation approach."
    465     },
    466     {
    467       "title": "Toolformer: Language models can teach themselves to use tools",
    468       "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessi"],
    469       "year": 2023,
    470       "relevance": "Foundational work on teaching LMs to use external tools, the broader paradigm that LaCy's CALL mechanism participates in."
    471     },
    472     {
    473       "title": "FActScore: Fine-grained atomic evaluation of factual precision in long form text generation",
    474       "authors": ["Sewon Min", "Kalpesh Krishna", "Xinxi Lyu"],
    475       "year": 2023,
    476       "doi": "10.18653/v1/2023.emnlp-main.741",
    477       "relevance": "The primary evaluation metric used in this paper for measuring factual accuracy of generated text."
    478     },
    479     {
    480       "title": "How much do language models memorize?",
    481       "authors": ["John X. Morris", "Chawin Sitawarin", "Chuan Guo"],
    482       "year": 2025,
    483       "arxiv_id": "2505.24832",
    484       "relevance": "Studies LLM memorization limits as a function of parameters, directly motivating the capacity argument for SLM delegation."
    485     },
    486     {
    487       "title": "Scaling laws for neural language models",
    488       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    489       "year": 2020,
    490       "arxiv_id": "2001.08361",
    491       "relevance": "Foundational scaling laws work; the paper argues that findings linking loss with downstream performance do not transfer to the token-selection setting."
    492     },
    493     {
    494       "title": "Learning to route LLMs with confidence tokens",
    495       "authors": ["Yu-Neng Chuang", "Prathusha Kameswara Sarma", "Parikshit Gopalan"],
    496       "year": 2025,
    497       "relevance": "Proposes analyzing which tokens a trained model is wrong on for routing, related to the token selection problem."
    498     },
    499     {
    500       "title": "Self-RAG: Learning to retrieve, generate, and critique through self-reflection",
    501       "authors": ["Akari Asai", "Zeqiu Wu", "Yizhong Wang"],
    502       "year": 2024,
    503       "relevance": "Self-reflective retrieval-augmented generation, related to the model cascade and delegation approach."
    504     }
    505   ]
    506 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs