scan.json (25210B)
1 { 2 "paper": { 3 "title": "How Does Controllability Emerge In Language Models During Pretraining?", 4 "authors": ["Jianshu She", "Xinyue Li", "Eric Xing", "Zhengzhong Liu", "Qirong Ho"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2508.01892", 8 "doi": "10.48550/arXiv.2508.01892" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Linear steerability emerges during intermediate-to-late stages of pretraining, distinct from the model's ability to express concepts. Different concepts (e.g., anger vs. sadness) exhibit steerability at different training stages. The Intervention Detector (ID) framework tracks concept representations across checkpoints and shows that concepts become increasingly linearly separable as training progresses, correlating with steerability emergence. Results generalize across CrystalCoder and Amber model families.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No code repository URL or link is provided anywhere in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The stimulus datasets were generated by ChatGPT-4 and are not released. The supervised tasks use public benchmarks (ARC, OBQA, RACE), but the constructed stimulus pairs are not shared." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specification, requirements file, or dependency list is provided. Fine-tuning parameters are listed in Table 5 but no software environment details." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided. The methodology is described at a conceptual level but no scripts or commands are given." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Figures show results across random seeds (e.g., Figure 2 mentions 5 random seeds, Figure 6 shows 3 random seeds) but no confidence intervals or error bars are reported in the main results tables or figures." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "No statistical significance tests are used. Claims about steerability emergence and differences between concepts are made by visual inspection of plots, not formal tests." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": false, 51 "justification": "No formal effect sizes are reported. Results are presented as accuracy percentages and ID scores without quantified effect magnitudes." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "256 stimulus pairs are used per experiment with no justification for this number. The choice of 5 random seeds (Figure 2) and 3 random seeds (Figure 6) is not justified." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Figure 2 reports results across 5 random seeds and Figure 6/11 show results across multiple random seeds, with visible spread in the plots." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Figure 2 compares baseline (no intervention) vs. intervened model at each checkpoint across four reasoning datasets." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper adapts contemporary intervention methods (RepE, CAA, ITI, CCS, ActAdd) summarized in Table 1, all from 2022-2024." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper tests interventions with different numbers of layers, scaling factors (Appendix E), learning rates for fine-tuning (Figure 7/16), and PCA vs K-Means decomposition (Appendix L)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics are used: ID scores, cosine similarity, entropy, layer-wise ID differences, ChatGPT emotion intensity scores, and accuracy on four reasoning datasets." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "Emotion intensity is evaluated by ChatGPT, not human judges. The paper acknowledges in Limitation (6) that concepts like emotions lack ground truth and evaluation relies on LLM-as-judge, introducing subjectivity." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Section 3 step 1 describes splitting stimulus data into Strain and Stest. The supervised tasks use standard benchmark test sets (ARC, OBQA, RACE)." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down per emotion (6 emotions in Figure 1b, 4, 5, 6) and per dataset (ARC Challenge, ARC Easy, OBQA, RACE in Figure 2, 7, 8, 9)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Figure 1(b) shows inconclusive results for some emotions at end of training. Figure 2 shows early-stage intervention reduces accuracy. Limitation section discusses cases where steerability does not clearly emerge." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Early-stage interventions reduce accuracy (Figure 2). Some emotions (surprise, disgust) show weak/inconclusive steerability. ARC Challenge/Easy only show effects at the final checkpoint." 
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims about emergence of linear steerability at intermediate stages and different emergence times for different concepts are supported by Figures 1, 2, 4, and related analyses." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper makes causal claims (e.g., 'steerability requires not only the presence of a concept but also its linear separability') but acknowledges in Figure 2 caption that 'earlier improvements may stem from pretraining itself rather than the steering effect.' The causal mechanism is inferred from correlation between ID metrics and intervention effectiveness." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title claims generality ('Language Models') but Limitation (1) acknowledges only two 7B-scale models were tested. The paper does not test larger models or different architectures beyond the LLaMA-based Crystal and Amber." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper discusses alternative explanations: learning rate effects with control groups (Figure 7), whether improvements stem from pretraining vs. steering (Figure 2 caption), and whether fine-tuning rather than pretraining drives the results (Appendix D.2, E)." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper clearly distinguishes between ID scores as a proxy and actual intervention effectiveness, and validates the proxy by showing correlation with downstream intervention results. It also distinguishes between the ability to express a concept and linear steerability of that concept." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific models are identified: LLM360 CrystalCoder (7B), CrystalChat, and Amber (7B, LLaMA architecture, 1.3T tokens). Checkpoint intervals are specified (every 15K steps). Architecture details in Table 2." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Appendix A provides the full stimulus template with concrete examples. Appendix B provides supervised task stimulus format. The ChatGPT evaluation prompt is given in full in Appendix A." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Table 5 reports fine-tuning hyperparameters (learning rate 2e-5, batch size 4, sequence length 512, etc.). Intervention parameters are stated: scaling factor of 40 and top 10 layers uniformly used." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The paper applies direct activation interventions to model hidden states." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 3 describes hidden state collection, normalization (Eq. 2), PCA decomposition. Appendix A describes stimulus construction (1500 scenarios generated by ChatGPT-4, 256 randomly selected per experiment). Fine-tuning data uses 1/10 of CrystalChat data for 1 epoch." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 6 'Limitation' provides seven numbered limitations spanning computational constraints, coefficient tuning, linearity assumption, methodology scope, alternative methods, evaluation difficulty, and safety." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "The limitations are specific: (1) only two 7B models tested, (2) intervention coefficients differ across models (~10 for Crystal, ~3 for Amber) without explanation, (6) emotion concepts lack ground truth making evaluation subjective." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Limitation (1) states they did not test larger models. Limitation (3) states nonlinear approaches were not explored. Limitation (4) states methodological innovation in intervention techniques is not the focus." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw data (hidden states, ID scores, stimulus sets) is made available for independent verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Appendix A describes generating 1500 scenarios with ChatGPT-4, random selection of 256 pairs, split into train/test. Section 3 describes hidden state extraction at -1 token position. Supervised task construction is described in Appendix B." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard benchmarks (ARC, OBQA, RACE) and synthetically generated stimuli." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Figure 3 shows the full pipeline: checkpoint selection → fine-tuning → stimulus construction → hidden state collection → PCA decomposition → ID score calculation → intervention. Each step is described in Section 3." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information or acknowledgments section is present in the paper." 
209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are listed: MBZUAI and Carnegie Mellon University. They use open-source models (LLM360 Crystal, Amber) rather than proprietary ones." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper uses LLM360 Crystal and Amber models evaluated on ARC, OBQA, and RACE benchmarks but does not state the training data cutoff dates for these models." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether ARC, OBQA, or RACE examples appeared in the pretraining data of Crystal or Amber models." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "ARC (2018), OBQA (2018), and RACE (2017) are public benchmarks that predate the models. No contamination analysis is performed despite this risk." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 
263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference costs or wall-clock times reported for the ID framework or interventions, despite the paper positioning ID as a 'cost-effective monitoring tool.'" 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total compute budget stated. The paper fine-tunes multiple checkpoints of 7B models but does not report GPU hours or hardware used." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": true, 296 "justification": "Figure 2 reports results across 5 random seeds. Figure 6 and 11 show results across 3 random seeds. Figure 11 shows mean ID scores across different random seeds." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Section 4.1 states 'We ran this experiment five times with random seeds' (Figure 11). Figure 6 caption mentions '3 random seeds.'" 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "The scaling factor of 40 and top 10 layers were selected empirically but no search budget or number of configurations tried is reported. 
Limitation (2) acknowledges coefficients were 'empirically tuned through multiple rounds of testing.'" 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The scaling factor of 40 and top 10 layers are used uniformly but the selection process is not transparent. The paper states these were empirically chosen without showing the selection methodology." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement their own version of prior intervention methods (RepE, CAA, etc.) and compare results. No acknowledgment of potential bias from self-implementation of baselines." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "No performance-vs-compute analysis despite the paper proposing ID as a cost-effective monitoring tool. The compute cost of running ID across all checkpoints is not quantified." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper uses ARC, OBQA, and RACE as proxies for commonsense reasoning without discussing whether these benchmarks actually measure the concept being steered. The relationship between 'commonsense reasoning representation' and benchmark performance is assumed, not validated." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved; interventions are direct activation additions." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "ARC (2018), OBQA (2018), and RACE (2017) all predate the models' training. 
No discussion of whether these benchmark solutions appeared in the pretraining corpus." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the supervised task stimulus construction (correct vs. incorrect answers) leaks information beyond what would be available in a real setting." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of independence between the stimulus training set and test set beyond stating they are randomly split from the same pool." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is applied." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Linear steerability emerges during intermediate-to-late stages of training, separate from the model's ability to express concepts via prompting.", 365 "evidence": "Figure 1(a) shows anger steerability emerging at ~68% of training steps, while the model could express anger earlier. Figure 2 shows intervention effectiveness increasing at later checkpoints across four reasoning datasets.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "Different concepts exhibit steerability emergence at distinct training stages.", 370 "evidence": "Figure 1(b) shows anger and fear emerge early while sadness, surprise, and disgust emerge later. Table 10 shows different emergence checkpoints across RACE (93%), OBQA (63%), ARC-C (99%), ARC-E (100%).", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Concept representations become increasingly linearly separable as training progresses, correlating with steerability emergence.", 375 "evidence": "Figure 4 heatmaps show ID scores increasing in higher layers at later checkpoints. Figure 10 shows first PCA component increasingly dominates. 
Figure 5 shows cosine similarity changes correlating with steerability emergence.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "The Intervention Detector framework generalizes across model families.", 380 "evidence": "Results shown for both CrystalCoder (Figures 1-8) and Amber (Figure 13, Appendix I), with similar patterns observed.", 381 "supported": "weak" 382 } 383 ], 384 "red_flags": [ 385 { 386 "flag": "LLM-as-judge without validation", 387 "detail": "Emotion intensity is evaluated by ChatGPT-4 with no human validation or inter-rater reliability assessment. The reference score for each emotion is calibrated to CrystalChat's output, creating a circular evaluation where the fully fine-tuned model defines the maximum score." 388 }, 389 { 390 "flag": "No statistical tests for main claims", 391 "detail": "Claims about emergence timing and concept differences are based on visual inspection of plots. No formal statistical tests establish when steerability 'emerges' or whether different concepts emerge at significantly different times." 392 }, 393 { 394 "flag": "Limited model diversity", 395 "detail": "Only two model families tested (Crystal and Amber), both 7B-scale, both LLaMA-derived architectures. Generalization claims to 'language models' broadly are not well supported." 396 }, 397 { 398 "flag": "Contamination risk unaddressed", 399 "detail": "All four evaluation benchmarks (ARC Challenge, ARC Easy, OBQA, RACE) predate the models by several years. If benchmark solutions appeared in pretraining data, the supervised task results could reflect memorization rather than genuine reasoning steerability." 400 } 401 ], 402 "cited_papers": [ 403 { 404 "title": "Representation Engineering: A Top-Down Approach to AI Transparency", 405 "authors": ["Andy Zou", "Long Phan", "Sarah Chen"], 406 "year": 2023, 407 "arxiv_id": "2310.01405", 408 "relevance": "Foundational method for representation-level intervention in LLMs, directly adapted in this paper's ID framework." 
409 }, 410 { 411 "title": "Steering Llama 2 via Contrastive Activation Addition", 412 "authors": ["Nina Panickssery", "Nick Gabrieli", "Julian Schulz"], 413 "year": 2024, 414 "arxiv_id": "2312.06681", 415 "relevance": "Contrastive activation addition method for LLM steering, one of the key methods adapted in this work." 416 }, 417 { 418 "title": "Inference-Time Intervention: Eliciting Truthful Answers from a Language Model", 419 "authors": ["Kenneth Li", "Oam Patel", "Fernanda Viégas"], 420 "year": 2023, 421 "arxiv_id": "2306.03341", 422 "relevance": "Inference-time intervention for truthfulness steering, directly compared and adapted in Table 1." 423 }, 424 { 425 "title": "Activation Addition: Steering Language Models Without Optimization", 426 "authors": ["Alexander Matt Turner", "Lisa Thiergart", "Gavin Leech"], 427 "year": 2024, 428 "arxiv_id": "2308.10248", 429 "relevance": "Activation addition steering method for sentiment and toxicity control in LLMs." 430 }, 431 { 432 "title": "Discovering Latent Knowledge in Language Models Without Supervision", 433 "authors": ["Collin Burns", "Haotian Ye", "Dan Klein", "Jacob Steinhardt"], 434 "year": 2024, 435 "arxiv_id": "2212.03827", 436 "relevance": "Contrast-Consistent Search (CCS) for unsupervised truthfulness extraction from LLM representations." 437 }, 438 { 439 "title": "Emergent Abilities of Large Language Models", 440 "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"], 441 "year": 2022, 442 "arxiv_id": "2206.07682", 443 "relevance": "Foundational work on emergence in LLMs that motivates this paper's study of steerability emergence." 444 }, 445 { 446 "title": "Scaling and Evaluating Sparse Autoencoders", 447 "authors": ["Leo Gao", "Tom Dupré la Tour"], 448 "year": 2024, 449 "arxiv_id": "2406.04093", 450 "relevance": "Sparse autoencoder interpretability work on extracting monosemantic features from LLMs." 
451 }, 452 { 453 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 454 "authors": ["DeepSeek-AI"], 455 "year": 2025, 456 "arxiv_id": "2501.12948", 457 "relevance": "Cold-start fine-tuning approach adopted in this paper's methodology for checkpoint preparation." 458 }, 459 { 460 "title": "Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling", 461 "authors": ["Stella Biderman", "Hailey Schoelkopf"], 462 "year": 2023, 463 "arxiv_id": "2304.01373", 464 "relevance": "Open-source model suite with pretraining checkpoints, relevant infrastructure for longitudinal LLM analysis." 465 }, 466 { 467 "title": "Are Emergent Abilities of Large Language Models a Mirage?", 468 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 469 "year": 2023, 470 "arxiv_id": "2304.15004", 471 "relevance": "Challenges the emergence narrative in LLMs, directly relevant to this paper's claims about steerability emergence." 472 } 473 ] 474 }