scan.json (19706B)
1 { 2 "paper": { 3 "title": "Position: Require Frontier AI Labs To Release Small \"Analog\" Models", 4 "authors": ["Shriyash Upadhyay", "Chaithanya Bandi", "Narmeen Oozeer", "Philip Quirke"], 5 "year": 2025, 6 "venue": "NeurIPS 2025 Position Paper Track", 7 "arxiv_id": "2510.14053", 8 "doi": "10.48550/arXiv.2510.14053" 9 }, 10 "scan_version": 2, 11 "active_modules": [], 12 "methodology_tags": ["theoretical"], 13 "key_findings": "This position paper proposes mandating frontier AI labs release small distilled 'analog models' (0.5-5% of frontier model parameters) as public proxies for safety research. The authors synthesize evidence that safety interventions transfer from small to large models (weak-to-strong generalization, representational convergence), estimate compliance costs at ~$93K (~0.1% of frontier training cost), and argue the policy relaxes the safety-innovation tradeoff. They use Meta's LLaMA release as a natural experiment supporting the approach.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": false, 18 "answer": false, 19 "justification": "This is a theoretical position/policy paper with no code artifacts to release." 20 }, 21 "data_released": { 22 "applies": false, 23 "answer": false, 24 "justification": "Position paper with no original data collection or analysis." 25 }, 26 "environment_specified": { 27 "applies": false, 28 "answer": false, 29 "justification": "No computational experiments were conducted." 30 }, 31 "reproduction_instructions": { 32 "applies": false, 33 "answer": false, 34 "justification": "No experiments to reproduce; this is a policy proposal." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": false, 40 "answer": false, 41 "justification": "No original experiments or statistical analyses are conducted. The paper synthesizes results from other studies." 
42 }, 43 "significance_tests": { 44 "applies": false, 45 "answer": false, 46 "justification": "No comparative empirical claims requiring significance tests. The paper is a policy argument." 47 }, 48 "effect_sizes_reported": { 49 "applies": false, 50 "answer": false, 51 "justification": "No original experiments. Effect sizes cited are from referenced papers, not original analysis." 52 }, 53 "sample_size_justified": { 54 "applies": false, 55 "answer": false, 56 "justification": "Theoretical/position paper with no samples." 57 }, 58 "variance_reported": { 59 "applies": false, 60 "answer": false, 61 "justification": "No original experimental runs." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares the analog-model mandate against alternative regulatory approaches: SB-1047, EO 14110, the 6-month AI pause, and unconstrained development (Figure 1B, Section 1)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The regulatory comparisons are contemporary — SB-1047 (2024), EO 14110 (2023), EU AI Act (2024)." 74 }, 75 "ablation_study": { 76 "applies": false, 77 "answer": false, 78 "justification": "Policy proposal with no system components to ablate." 79 }, 80 "multiple_metrics": { 81 "applies": false, 82 "answer": false, 83 "justification": "No quantitative evaluation of the proposed policy is conducted." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "No system outputs to evaluate." 89 }, 90 "held_out_test_set": { 91 "applies": false, 92 "answer": false, 93 "justification": "No experiments involving test sets." 94 }, 95 "per_category_breakdown": { 96 "applies": false, 97 "answer": false, 98 "justification": "No quantitative results to break down." 
99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 4.3 discusses multiple risk/failure scenarios: IP exposure, dual-use misuse, regulatory burden, substitution risk, second-order policy effects, and limits of transferability." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 4.3 'Uncertainty and Potential Limits to Transferability' explicitly acknowledges that emergent behaviors at frontier scale might not manifest in analog models and that 'foundational research supporting weak-to-strong transferability remains nascent.'" 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract's main claims — that safety interventions transfer from small to large models, and that compliance costs are minimal — are supported in Section 2 and Table 1, respectively, citing specific papers and cost estimates." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper makes causal claims like the analog mandate 'relaxes the safety-innovation tradeoff' and 'accelerates safety advancements,' but the evidence is drawn from a handful of cited studies on intervention transfer plus the LLaMA case study. No causal identification strategy connects the proposed policy to its claimed outcomes." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper generalizes from a few intervention-transfer studies (Oozeer et al., Lee et al., Burns et al.) on specific model families (Qwen, Llama, GPT-2) to a universal policy for all frontier models. The title and abstract make broad claims without bounding to the tested architectures and safety properties."
126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section 4.3 discusses alternative views: that transferability might not hold at frontier scale, that analog models could enable misuse, that the policy could displace other safety initiatives, and that IP concerns could outweigh benefits." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper's central thesis is that small analog models are valid proxies for frontier models in safety research. However, it does not rigorously discuss the gap between what analog models can reveal versus what frontier-scale behaviors actually look like. The limits of the proxy relationship are mentioned only briefly in Section 4.3." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": false, 141 "answer": false, 142 "justification": "No models were run by the authors. Model names cited are from referenced studies." 143 }, 144 "prompts_provided": { 145 "applies": false, 146 "answer": false, 147 "justification": "No prompting was used in this paper." 148 }, 149 "hyperparameters_reported": { 150 "applies": false, 151 "answer": false, 152 "justification": "No experiments conducted by the authors." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding used." 158 }, 159 "data_preprocessing_documented": { 160 "applies": false, 161 "answer": false, 162 "justification": "No data collected or processed." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 4.3 'Risks and Mitigation Strategies' serves as a substantive limitations discussion with six identified risk categories." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 4.3 includes specific threats: 'Emergent behaviors unique to frontier-scale models might not manifest similarly in analog models' and 'the foundational research supporting weak-to-strong transferability remains nascent, introducing uncertainty.'" 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what its proposal does NOT cover. It does not specify which types of frontier models or safety properties are out of scope. The Future Work section mentions broadening to multimodal but does not frame current scope boundaries." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": false, 185 "answer": false, 186 "justification": "No original data collected. This is a position paper synthesizing existing literature." 187 }, 188 "data_collection_described": { 189 "applies": false, 190 "answer": false, 191 "justification": "No data collection procedure — the paper reviews existing published work." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No participants or samples recruited." 197 }, 198 "data_pipeline_documented": { 199 "applies": false, 200 "answer": false, 201 "justification": "No data pipeline exists for this position paper." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding disclosure or acknowledgments section. All authors are affiliated with Martian, a company in the AI space." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All four authors list 'Martian' as their affiliation on the first page." 
214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "All authors are from Martian, an AI company that could benefit from frontier labs being required to release analog models (enabling competitors to study frontier model properties). The paper does not discuss this potential conflict." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement. The authors work at an AI company (Martian) that could be affected by the proposed regulation; the affiliation itself is listed, but this potential financial interest is not declared." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "No pre-trained model evaluated on any benchmark." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "No benchmark evaluation conducted." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "No benchmark evaluation conducted." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants."
278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "Theoretical/position paper; no method with inference costs." 285 }, 286 "compute_budget_stated": { 287 "applies": false, 288 "answer": false, 289 "justification": "No computational experiments conducted by the authors." 290 } 291 } 292 }, 293 "claims": [ 294 { 295 "claim": "Safety and interpretability interventions discovered in small models reliably transfer to larger frontier models.", 296 "evidence": "Section 2.1 cites Oozeer et al. (2025) showing steering vectors from 0.5B-1B models transfer to 1.5B-3B models (trigger rates from ~100% to <5%), and Lee et al. (2025) showing embedding spaces remain near-isometric across 1B-70B models (Pearson r > 0.9).", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Producing an analog model costs approximately $93K, or ~0.1% of a frontier model's training budget.", 301 "evidence": "Table 1 breaks down costs: $70K training, $12K distillation, $8.6K safety fine-tuning, $1.8K hosting, based on AWS on-demand pricing and the JetMoE training procedure (Shen et al. 2024).", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Small model outputs as supervision for fine-tuning GPT-4 recovered over 90% of GPT-4's performance on alignment benchmarks.", 306 "evidence": "Section 2.3 cites Burns et al. 
(2023) on weak-to-strong label transfer from 124M GPT-2 to GPT-4 on MMLU, APPS, and TruthfulQA.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "Meta's LLaMA release demonstrates that open model releases catalyze safety research without cannibalizing commercial revenue.", 311 "evidence": "Section 4.3 brown-field case study cites 15.8k Google Scholar citations, 58k GitHub stars, 7.7M HuggingFace downloads, 60+ peer-reviewed safety studies, and 'no measurable cannibalization of commercial API revenue.'", 312 "supported": "weak" 313 } 314 ], 315 "red_flags": [ 316 { 317 "flag": "Undisclosed conflict of interest", 318 "detail": "All authors work at Martian, an AI company that would benefit from frontier labs being forced to release analog models (enabling competitors and smaller companies to study frontier model behaviors). This commercial interest in the proposed regulation is never acknowledged." 319 }, 320 { 321 "flag": "Selective evidence synthesis", 322 "detail": "The technical evidence in Section 2 is drawn from a small number of favorable studies. Counter-evidence on transferability failures (e.g., emergent capabilities that don't appear at small scale) is acknowledged only briefly in Section 4.3 rather than systematically reviewed." 323 }, 324 { 325 "flag": "Claims outrun evidence", 326 "detail": "The paper claims analog models serve as reliable proxies for all frontier models, but the cited transfer experiments cover only a few model families (Qwen, Llama, GPT-2) and a limited set of safety behaviors. Generalization to arbitrary frontier architectures and emergent behaviors is assumed, not demonstrated." 327 }, 328 { 329 "flag": "Unsupported revenue claim", 330 "detail": "The claim that Meta experienced 'no measurable cannibalization of commercial API revenue' from LLaMA is presented without a source or evidence. This is central to the substitution risk argument." 
331 } 332 ], 333 "cited_papers": [ 334 { 335 "title": "Weak-to-strong generalization: Eliciting strong capabilities with weak supervision", 336 "authors": ["Collin Burns", "Pavel Izmailov", "Jan Hendrik Kirchner"], 337 "year": 2023, 338 "relevance": "Foundational work on whether small model supervision transfers to large models — directly relevant to AI safety alignment methodology." 339 }, 340 { 341 "title": "Activation space interventions can be transferred between large language models", 342 "authors": ["Narmeen Oozeer", "Dhruv Nathawani", "Nirmalendu Prakash"], 343 "year": 2025, 344 "arxiv_id": "2503.04429", 345 "relevance": "Demonstrates cross-scale transfer of steering vectors for safety-critical behaviors — core evidence for analog model viability." 346 }, 347 { 348 "title": "The platonic representation hypothesis", 349 "authors": ["Minyoung Huh", "Brian Cheung", "Tongzhou Wang", "Phillip Isola"], 350 "year": 2024, 351 "arxiv_id": "2405.07987", 352 "relevance": "Theoretical foundation for why model representations converge across scales, supporting intervention transferability." 353 }, 354 { 355 "title": "Towards monosemanticity: Decomposing language models with dictionary learning", 356 "authors": ["Trenton Bricken", "Adly Templeton", "Joshua Batson"], 357 "year": 2023, 358 "relevance": "Sparse dictionary learning recovers shared interpretable features across model scales — supports interpretability transfer claims." 359 }, 360 { 361 "title": "Toy models of superposition", 362 "authors": ["Nelson Elhage", "Tristan Hume", "Catherine Olsson"], 363 "year": 2022, 364 "arxiv_id": "2209.10652", 365 "relevance": "Quantifies how feature representations remain stable across model scales despite superposition — mechanistic interpretability foundation." 
366 }, 367 { 368 "title": "Scaling laws for neural language models", 369 "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"], 370 "year": 2020, 371 "arxiv_id": "2001.08361", 372 "relevance": "Foundational scaling laws paper — relevant to claims about predictable capability scaling." 373 }, 374 { 375 "title": "Training compute-optimal large language models", 376 "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"], 377 "year": 2022, 378 "arxiv_id": "2203.15556", 379 "relevance": "Chinchilla scaling laws — relevant to compute efficiency and model training methodology." 380 }, 381 { 382 "title": "Are emergent abilities of large language models a mirage?", 383 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 384 "year": 2023, 385 "arxiv_id": "2304.15004", 386 "relevance": "Challenges emergent abilities narrative — supports the paper's claim that capabilities scale smoothly." 387 }, 388 { 389 "title": "Shared global and local geometry of language model embeddings", 390 "authors": ["Andrew Lee", "Melanie Weber", "Fernanda Viégas", "Martin Wattenberg"], 391 "year": 2025, 392 "arxiv_id": "2503.21073", 393 "relevance": "Shows embedding spaces remain near-isometric across model scales — direct evidence for cross-scale intervention transfer." 394 }, 395 { 396 "title": "Sparse crosscoders for cross-layer features and model diffing", 397 "authors": ["Jack Lindsey", "Adly Templeton", "Jonathan Marcus"], 398 "year": 2024, 399 "relevance": "Interpretability tools showing representational universality across model sizes — supports analog model research utility." 400 }, 401 { 402 "title": "LLaMA: Open and efficient foundation language models", 403 "authors": ["Hugo Touvron", "Thibaut Lavril", "Gautier Izacard"], 404 "year": 2023, 405 "arxiv_id": "2302.13971", 406 "relevance": "The paper's primary case study for open model release impact on safety research and innovation." 
407 }, 408 { 409 "title": "Deep reinforcement learning from human preferences", 410 "authors": ["Paul F. Christiano", "Jan Leike", "Tom B. Brown"], 411 "year": 2017, 412 "relevance": "RLHF foundational work — cited as precedent for weak-to-strong alignment approaches." 413 } 414 ] 415 }