scan.json (23685B)
1 { 2 "paper": { 3 "title": "Optimal Attention Temperature Enhances In-Context Learning under Distribution Shift", 4 "authors": ["Samet Demir", "Zafer Doğan"], 5 "year": 2025, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2511.01292", 8 "doi": "10.48550/arXiv.2511.01292" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor"], 12 "methodology_tags": ["theoretical", "benchmark-eval"], 13 "key_findings": "The paper derives closed-form optimal attention temperature for linearized softmax Transformers doing in-context learning under distribution shift. The optimal temperature depends on the nature of the shift (input covariance, noise level). Experiments on synthetic linear regression tasks validate theory, and LLaMA-2-7B experiments on SCIQ show that temperature tuning improves ICL robustness under noisy label distribution shifts. Temperature adjustment can fully compensate for simple shifts (e.g., scaled covariance) and partially mitigate more complex ones.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The reproducibility statement says 'the code for the experimental results will be released with the camera-ready version of this work.' Future release does not count." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Experiments use synthetic data (generated from described distributions) and the publicly available SCIQ dataset (Welbl et al., 2017). No proprietary data." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "GPU types are mentioned (NVIDIA Tesla V100 for GPT-2, NVIDIA A40 for LLaMA) but no requirements.txt, library versions, or environment setup details are provided." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "Settings are described in captions and appendices, but no step-by-step reproduction instructions or scripts are provided. 
Code is not yet released." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "Figure 3 (LLaMA-2 experiments) includes error bars showing one standard deviation, as stated: 'Results (averaged over 12 Monte Carlo runs) include error bars showing one standard deviation.'" 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims optimal temperature 'improves' and 'enhances' ICL performance but no statistical significance tests are reported — comparisons are visual from figures." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": false, 51 "justification": "Results are presented visually in figures. No quantitative effect sizes (e.g., percentage improvement, Cohen's d) are reported in tables or text for the experimental comparisons." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "12 Monte Carlo runs for LLaMA experiments and 5000 pretraining tasks for synthetic experiments are stated but not justified. No power analysis or explanation of why these numbers are sufficient." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Figure 3 reports results 'averaged over 12 Monte Carlo runs' with 'error bars showing one standard deviation.' Synthetic experiments appear to be deterministic given the analytical setup." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The Bayes-optimal ridge estimator serves as a principled baseline throughout. Comparisons are made between default temperature (τ=1) and optimal temperature, and between linear vs. linearized attention." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The Bayes-optimal estimator is the theoretically strongest baseline for this setting. 
Zhang et al. (2024) and Han et al. (2024) are recent relevant references used." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper systematically varies distribution shift types (input covariance, task distribution, noise), context length, and temperature, effectively ablating each factor's contribution (Figures 1-3)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": false, 83 "justification": "Generalization (ICL) error is the sole metric for synthetic experiments. Exact-match score is the sole metric for LLaMA-2 SCIQ experiments. Only one metric per experiment type." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is irrelevant to this theoretical/simulation study about attention temperature optimization." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The framework explicitly separates Dtrain and Dtest distributions (Section 3.5). The generalization error is defined over Dtest with task vectors not encountered during training." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by shift type (input covariance, task distribution, noise), context length ratios (l/d), and noise levels (Figures 1-3, 6)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 4.4 discusses that optimal temperature 'may only partially mitigate the impact' in complex scenarios. Figure 1b shows degradation under covariance shift without temperature correction." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper shows that without temperature adjustment, ICL degrades under distribution shift. It also shows temperature cannot always fully compensate — only partial mitigation in complex cases." 
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims closed-form error expressions (Theorem 4.6), optimal temperature existence (Theorem 4.7), and validation on GPT-2 and LLaMA2-7B — all present in the paper with supporting figures and proofs." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Causal claims ('optimal temperature improves ICL') are backed by: (1) mathematical proof that τ_optimal minimizes generalization error (Theorem 4.7, Appendix I), and (2) controlled experiments varying single factors." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title and abstract claim broad applicability ('pretrained Transformers'), but theory is derived for single-layer linearized softmax attention on linear regression. LLM experiments are limited to LLaMA-2-7B on one dataset (SCIQ) with one shift type. The paper does not adequately bound claims to these settings." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No discussion of alternative explanations for the LLM experimental results. The connection between theoretical framework (linearized attention, linear regression) and LLM behavior relies on a heuristic (Appendix J) without discussing alternative factors that could explain the LLaMA-2 improvements." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper uses noisy labels as a 'proxy for distribution shift' in LLM experiments (Appendix K.4), acknowledging this is a hypothesis but not substantively discussing the gap between this proxy and actual pretraining distribution shift." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "GPT-2 (Radford et al., 2019) with specific architecture details (12 layers, 8 heads, d=20) and LLaMA2-7B (Touvron et al., 2023) are specified. These are specific model sizes/versions." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Table 1 provides a concrete example of the ICL prompt format used for SCIQ experiments, including the demonstration structure and test example format." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Key hyperparameters reported: d=50, m=5000, σ=0.1, pretraining distributions for synthetic experiments. Temperature scaling τ√dk for LLMs. Section 5 and captions detail settings." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The paper modifies attention temperature directly." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Appendix K.3-K.4 describes how SCIQ demonstrations are selected (TopK retrieval), how noisy labels are constructed (relevant but incorrect answers), and the noisy ratio parameter." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "No dedicated limitations section. The reproducibility statement, ethics statement, and LLM usage statement are present but none discuss limitations of the work itself." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats-to-validity discussion. The paper does not address specific threats like the gap between linearized and full softmax attention, or the narrow empirical validation on a single LLM and dataset." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what the results do NOT show. Remark 4.3 briefly notes the pretrained parameters 'are not guaranteed to be optimal in all settings' but no systematic scope bounding is provided." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental data (run outputs, individual trial results) are available. Only aggregate figures are shown." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Synthetic data generation is fully specified via equations (Section 3.1, Eq. 2). SCIQ dataset usage and demonstration selection are described in Appendix K.3." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are synthetic generation and a standard public benchmark (SCIQ)." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from data generation to evaluation is documented: synthetic data via specified distributions, SCIQ with TopK retrieval for demonstrations, noisy label injection per Gao et al. (2024)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Acknowledgments section discloses TÜBİTAK project 124E063, KUIS AI Center fellowship, and TÜBİTAK BİDEB 2211 PhD scholarship." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Authors are from MLIP Research Group, KUIS AI Center, and Department of EEE at Koç University. No products being evaluated belong to their institution." 
214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "TÜBİTAK (Turkish national research council) and KUIS AI Center are academic funders with no apparent financial stake in the outcomes of attention temperature research." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper uses LLaMA-2-7B on SCIQ but does not state the model's training data cutoff date." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether SCIQ questions appeared in LLaMA-2's training data." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "SCIQ was published in 2017, well before LLaMA-2's training. The paper does not discuss whether these questions could be in the training set." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 
268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "Wall-clock time is mentioned vaguely ('a single Monte Carlo run per plot in Figure 3 takes a few hours', GPT-2 '~10 minutes') but no per-example cost or total compute cost is reported." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "GPU types are mentioned (V100, A40) but total GPU hours or compute budget are not quantified." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": true, 296 "justification": "LLaMA-2 experiments are 'averaged over 12 Monte Carlo runs' with std dev error bars shown (Figure 3). Synthetic experiments are effectively averaged over tasks." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "12 Monte Carlo runs explicitly stated for LLaMA-2 experiments. m=5000 tasks for synthetic pretraining." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "The optimal temperature is derived theoretically, but for the LLM experiments, the temperature is searched empirically (Figure 3 shows performance across temperatures) without stating the search budget." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "The optimal temperature is derived from theory (Theorem 4.7) and the heuristic for LLMs is based on pre-softmax score statistics (Appendix J). Selection is principled, not cherry-picked." 
312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The paper compares its proposed temperature-adjusted model against default temperature without acknowledging self-comparison bias. No independent evaluation." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": false, 325 "answer": false, 326 "justification": "Temperature adjustment is a zero-cost inference-time modification. Compute differences between configurations are negligible." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "Appendix K.4 discusses why noisy labels serve as a proxy for distribution shift, with a perplexity-based argument. The paper also discusses the relationship between linearized and standard softmax in Appendix D." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. Temperature modification is applied directly to the attention mechanism." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "SCIQ (2017) predates LLaMA-2 training. No discussion of whether the model memorized answers." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the ICL prompt format leaks information that would not be available in a realistic deployment." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of independence between SCIQ examples used in demonstrations and test examples." 
354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is applied." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "An optimal attention temperature exists that minimizes generalization error for in-context learning under distribution shift.", 365 "evidence": "Theorem 4.7 provides a closed-form expression for τ_optimal (Eq. 15), with proof in Appendix I showing it minimizes the quadratic generalization error expression.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Applying optimal temperature can fully compensate for simple distribution shifts (e.g., scaled input covariance).", 370 "evidence": "Section 4.4 states that for w~N(0,I), σ=0, and shift from N(0,I) to N(0,cI), τ_optimal=c fully counteracts the shift. Figure 1b validates empirically.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Temperature adjustment improves ICL robustness in LLaMA-2-7B on SCIQ under noisy label distribution shift.", 375 "evidence": "Figure 3 shows improved exact-match scores at non-default temperatures across varying noise ratios and context lengths, averaged over 12 runs with error bars.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Linearized softmax attention is more robust to input mean shifts than linear attention.", 380 "evidence": "Figure 4 and Remark 3.4 show linearized attention recovers Bayes-optimal performance under mean shift while linear attention fails.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "The theoretical framework extends prior work by using weaker assumptions than Zhang et al. 
(2024).", 385 "evidence": "Assumption 3.1 allows bounded (not zero) means and general covariances, versus Zhang et al.'s more restrictive Σx=Σw=I, µw=0.", 386 "supported": "strong" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "Theory-practice gap", 392 "detail": "The theory is derived for single-layer linearized softmax attention on linear regression. The jump to multi-layer multi-head GPT-2 and to LLaMA-2 on an NLP task (SCIQ) is bridged only by an informal heuristic (Appendix J), which the authors themselves acknowledge 'should be viewed as preliminary.'" 393 }, 394 { 395 "flag": "Very narrow LLM validation", 396 "detail": "LLM experiments use only one model (LLaMA-2-7B) on one dataset (SCIQ) with one type of shift (noisy labels). The broad claims about 'pretrained Transformers' and 'actionable guidance' are not supported by this narrow empirical base." 397 }, 398 { 399 "flag": "No limitations section", 400 "detail": "Despite significant assumptions (linearized attention, linear regression, high-dimensional regime), the paper has no limitations or threats-to-validity discussion." 401 }, 402 { 403 "flag": "Contamination risk unaddressed", 404 "detail": "SCIQ (2017) likely appears in LLaMA-2's training data. If the model has memorized answers, temperature adjustments may be exploiting memorization rather than improving genuine ICL." 405 } 406 ], 407 "cited_papers": [ 408 { 409 "title": "Language models are few-shot learners", 410 "authors": ["Tom Brown"], 411 "year": 2020, 412 "relevance": "Foundational work on in-context learning in large language models." 413 }, 414 { 415 "title": "What can transformers learn in-context? A case study of simple function classes", 416 "authors": ["Shivam Garg", "Dimitris Tsipras", "Percy S Liang", "Gregory Valiant"], 417 "year": 2022, 418 "relevance": "Established the linear regression testbed for studying ICL in Transformers, used as experimental foundation in this paper." 
419 }, 420 { 421 "title": "Trained transformers learn linear models in-context", 422 "authors": ["Ruiqi Zhang", "Spencer Frei", "Peter L. Bartlett"], 423 "year": 2024, 424 "relevance": "Direct predecessor: showed linear Transformers approximate Bayes-optimal inference in ICL under distribution shift." 425 }, 426 { 427 "title": "Attention is all you need", 428 "authors": ["Ashish Vaswani"], 429 "year": 2017, 430 "relevance": "Introduced the Transformer architecture and the standard attention temperature τ=√dk." 431 }, 432 { 433 "title": "Emergent abilities of large language models", 434 "authors": ["Jason Wei"], 435 "year": 2022, 436 "relevance": "Key reference on how model scale shapes ICL performance and emergent capabilities." 437 }, 438 { 439 "title": "On the noise robustness of in-context learning for text generation", 440 "authors": ["Hongfu Gao"], 441 "year": 2024, 442 "relevance": "Provides the noisy-label ICL experimental framework used for LLaMA-2 validation in this paper." 443 }, 444 { 445 "title": "Bridging the divide: Reconsidering softmax and linear attention", 446 "authors": ["Dongchen Han"], 447 "year": 2024, 448 "relevance": "Showed linearized softmax can approximate standard softmax, providing the theoretical basis for the attention model used." 449 }, 450 { 451 "title": "Softmax is not enough (for sharp size generalisation)", 452 "authors": ["Petar Veličković"], 453 "year": 2025, 454 "relevance": "Recent work on adaptive attention temperature schemes, directly related to this paper's focus." 455 } 456 ] 457 }