scan.json (24715B)
1 { 2 "paper": { 3 "title": "Predicting LLM Reasoning Performance with Small Proxy Model", 4 "authors": ["Woosung Koh", "Juyoung Suk", "Sungjun Han", "Se-Young Yun", "Jamin Shin"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2509.21013", 8 "doi": "10.48550/arXiv.2509.21013" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "RBRIDGE enables small proxy models (≤1B) to predict large-model (13B-32B) reasoning performance by aligning evaluation with the pre-training objective (NLL) and the target task (using frontier model reasoning traces as gold labels with automatic token weighting). It achieves 100x+ compute savings for dataset ranking, strongest proxy-target correlation across six benchmarks, and successful zero-shot functional relationship transfer across pre-training datasets.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No code repository URL is provided. The paper mentions open-sourcing the dataset but no link to code is given." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper states 'we plan to open-sourced our dataset' (§5) and uses publicly available benchmarks (GSM8K, MATH500, ARC-C, etc.) and OLMo checkpoints. The frontier model reasoning traces are described as planned for release." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Appendix C.1 specifies hardware: 'A100 80G, H100 and H200 nodes' and '256 H100 GPUs with HBM3' for pre-training. Hardware is detailed, but no software dependencies or library versions are listed." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": true, 34 "justification": "The paper provides pseudocode (Algorithm 1 in Appendix B), the exact prompt used, and states the method is 'fully reproducible using the information provided in this paper' (§7 Reproducibility Statement). Experimental protocols reference open-source assets from Magnusson et al. (2025)." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "Error bars are shown in Fig. 4 ('Error bars indicate one standard deviation'), Fig. 7a uses box-and-whisker plots showing distributional spread, and Fig. 12 shows error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "No statistical significance tests are reported. Comparisons between RBRIDGE and baselines are made by comparing R² and MAE values directly without any significance test." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Effect sizes are reported contextually: e.g., '27% higher DAcc.' (§4.2(i)), '100.2× to 733.4× less FLOPs' (§4.2(i)), '74.7% NLL decline' (§3.2). R² and MAE values provide magnitude context." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification is given for sample sizes. The number of pre-training checkpoints (15 data points at 250B intervals), the choice of 25 datasets, and the number of benchmarks (6) are not justified." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "5-fold cross-validation is used for experiments (ii) and (iii), and experiment (i) uses proxy models averaged across three pre-training seeds (§4.1(i)). Standard deviation shown in Fig. 4 error bars." 
62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Five baselines for dataset ranking (§4.1(i)) and six baselines for proxy-target relationship (§4.1(ii)) are compared, including Acc./p@1, iSFT, TED, MPCA, NLL, and Rϕ." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Baselines include contemporary methods: ScalingBench (Xiao et al., 2024), DataDecide (Magnusson et al., 2025), iSFT (Snell et al., 2024), and TED (Schaeffer et al., 2023)." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Fig. 7b shows an ablation study decomposing RBRIDGE into its components: Rϕ → +RBRIDGE NLL → +Normalization, showing each contributes consistent improvement across all three experimental settings." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics are used: R², MAE, Decision Accuracy, Kendall's Tau (Appendix D), and evaluation spans six benchmarks covering math, science, engineering, commonsense, and coding." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is irrelevant — this is a computational methodology for predicting model performance via proxy metrics." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "5-fold cross-validation is used (§4.1(ii)), reporting both train R² and test MAE. Experiment (iii) uses a separate pre-training dataset D' as held-out evaluation." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down per benchmark (6 benchmarks in Tab. 2, Tab. 5, Tab. 6) rather than just aggregate averages." 
99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper discusses failure cases: one outlier in zero-shot transfer (CQA MAE=9.716 in Tab. 3), and limitations where frontier models fail to produce outputs in the required format (§5.1)." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Negative results are reported: Fig. 2 shows small models giving slopes in the wrong direction, Fig. 3b shows OOD gold labels providing no signal, and Tab. 1 shows ScB performing worse than Rϕ." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims — 100x compute reduction, strongest correlation across six benchmarks, zero-shot transfer — are all supported by results in Tab. 2, Fig. 6, and Tab. 3 respectively." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Causal claims like 'alignment improves performance' are supported by the ablation study (Fig. 7b) showing controlled removal of components. The paper uses controlled single-variable manipulation." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper bounds claims to specific model sizes (1B→13B, 1B→32B), specific datasets (OLMo-Mix-1124), and acknowledges limitations on scale ('larger-scale studies across more model sizes and pre-training datasets would be ideal', §4.1(iii))." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper discusses distributional alignment as an alternative explanation (§3.1), analyzes why existing approaches fail along two axes, and considers that frontier model imperfections could affect results (§5.1)." 
131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper explicitly frames RBRIDGE as a proxy metric for target-scale accuracy and discusses the gap between proxy and target metrics throughout (§2 Problem Setting defines the proxy-target relationship formally)." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper says 'we use GPT 4o to generate Rϕ' (Appendix C.3) without specifying a version or snapshot date. Model sizes (1B, 7B, 13B, 32B) are given but these are OLMo models referenced by size, not version." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "The exact prompt used to generate reasoning traces is provided in Appendix B: 'System: You are a helpful assistant that solves [task] problems. User: [question] Respond ONLY with a JSON object...'" 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Greedy decoding is specified for Rϕ generation (Appendix B). SFT hyperparameters are in Tab. 4 (learning rate, warmup ratio, batch size, epochs). Pre-training follows OLMo 2 settings." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. RBRIDGE is a metric computation, not an agent system." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "The alternative dataset D' composition is described (Appendix C.4): '8.5:1:0.5 ratio of English:multilingual:math/code' with specific dataset sources. Frontier model output extraction is documented (discard answer, keep reasoning trace)." 
163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 5.1 'Limitation and Future Direction' discusses three specific limitations of RBRIDGE." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "§5.1 discusses specific threats: frontier models do not achieve perfect accuracy on reasoning tasks, reasoning trace extraction suffers format failures, and the framework for practical application remains an open challenge." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper acknowledges scope limits: 'larger-scale studies across more model sizes and pre-training datasets would be ideal' (§4.1(iii)), the method is not tested on long CoT models (Appendix C.3), and HumanEval is excluded from experiment (iii) due to 0% p@1." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "Raw experimental data (individual checkpoint results, per-example scores) are not released. Only aggregated results in tables and figures are provided." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Data collection is well-described: benchmarks are standard public datasets, pre-training uses OLMo-Mix-1124 with checkpoints at 250B intervals, and the alternative dataset composition is specified in Appendix C.4." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard public benchmarks and pre-training datasets." 
197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline is documented: frontier model generates reasoning traces → extract Rϕ → compute letter-level probabilities → aggregate to token weights → compute weighted NLL (Algorithm 1, Fig. 1)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding or acknowledgments section is present in the paper. Authors are from Trillion Labs and KAIST AI, but no funding sources are disclosed." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly stated: Trillion Labs and KAIST AI, with correspondence emails provided." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed. Trillion Labs is a company (authors Han, Shin are affiliated) that could have financial interest in efficient pre-training methods." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is present. Authors from Trillion Labs may have financial interests related to efficient LLM training methods." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff date is stated for the OLMo models or GPT-4o used. The paper does not discuss when the pre-training data was collected." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of potential train/test overlap between OLMo pre-training data and benchmark test sets (GSM8K, MATH500, ARC-C, etc.)." 
236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "Benchmarks like ARC-C (2018), GSM8K (2021), MATH500 (2021) were published years before model training, creating contamination risk. This is not addressed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Cost is discussed: 'a small one-time cost of under $10 per benchmark' for generating Rϕ (§5), compute measured in FLOPs throughout (Fig. 6), and 'thousands of H100 hours' for additional training runs (§4.1(iii))." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Compute is quantified via FLOPs formula (6ND, §4.1(i)), hardware specified (256 H100 GPUs, Appendix C.1), and compute savings factors explicitly stated (100.2× to 733.4×)." 
290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": true, 296 "justification": "Experiment (i) uses results 'averaged across three pre-training seeds' (§4.1(i), following Magnusson et al. 2025). 5-fold cross-validation provides variance estimates for experiments (ii-iii)." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Three pre-training seeds for experiment (i) (§4.1(i)), 5-fold cross-validation for experiments (ii-iii) (§4.1(ii))." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": true, 306 "justification": "The curve fitting hypothesis space is defined a priori: 'linear, quadratic, exponential, and logarithmic. This hypothesis space was defined a priori to avoid overfitting' (§4.1(ii))." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "Configuration selection is transparent: 'Curve fitting selects the best function based on train R²' (§4.1(ii)), with the hypothesis space defined before experiments." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "No multiple comparison correction is applied despite comparing 7 methods across 6 benchmarks in multiple experimental settings." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "No discussion of author-evaluation bias. The authors implemented all baselines and their own method without acknowledging this potential bias." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Fig. 6 explicitly plots Decision Accuracy as a function of FLOPs, and Fig. 7a compares RBRIDGE at 1B against larger proxy models (7B, 13B) using the target metric." 
327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether the six benchmarks actually measure 'reasoning' as claimed. The paper assumes benchmarks like ARC-C and CQA measure reasoning without questioning construct validity." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. RBRIDGE is a metric computation method, not an agent or scaffold." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of temporal leakage. Benchmarks like GSM8K (2021) and MATH500 (2021) predate the OLMo training data, and this is not addressed." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of feature leakage. The 5-shot CoT evaluation setup provides exemplars that could leak information, but this is not discussed." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether benchmark examples share structural similarities with pre-training data or with each other." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection or prevention method is applied." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "RBRIDGE reduces dataset ranking compute cost by over 100× relative to the best baseline.", 365 "evidence": "Fig. 6b shows RBRIDGE achieves equivalent Decision Accuracy with 100.2× to 733.4× fewer FLOPs across 25 pre-training datasets (§4.2(i)).", 366 "supported": "strong" 367 }, 368 { 369 "claim": "RBRIDGE achieves the strongest correlation across six reasoning benchmarks at 1B to 32B scale.", 370 "evidence": "Tab. 
2 shows RBRIDGE achieves best average train R² (0.874) and test MAE (1.384) for 1B→13B, and best averages for 1B→13B+SFT and 1B→32B across all methods.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "RBRIDGE outperforms proxy models 7-13× larger using the target metric.", 375 "evidence": "Fig. 7a shows RBRIDGE at 1B achieves lower test MAE than Acc./p@1 at 7B and 13B scale for predicting 32B performance.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "RBRIDGE enables zero-shot functional relationship transfer across pre-training datasets.", 380 "evidence": "Tab. 3 shows transferred function achieves 5/5 correct rankings and MAE of 0.043-1.417 on most benchmarks (one outlier at 9.716), at 1B→7B scale.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Reasoning trace Rϕ is more in-distribution than benchmark-provided gold labels.", 385 "evidence": "Fig. 4 shows 74.7% average NLL decline when using Rϕ across five reasoning benchmarks, and Tab. 1 shows better proxy performance with Rϕ.", 386 "supported": "strong" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "No contamination analysis", 392 "detail": "The paper evaluates OLMo models on public benchmarks (GSM8K, MATH500, ARC-C, HumanEval) without any discussion of data contamination, despite these benchmarks being publicly available before model training." 393 }, 394 { 395 "flag": "Limited zero-shot transfer evaluation", 396 "detail": "The zero-shot transfer result (experiment iii) is demonstrated on only one additional dataset at 1B→7B scale with a single data point (1T tokens), making the claim of 'zero-shot transfer' rest on very limited evidence." 397 }, 398 { 399 "flag": "No significance testing", 400 "detail": "All comparisons between RBRIDGE and baselines rely on point comparisons of R² and MAE values without any statistical significance tests, despite running multiple comparisons across methods and benchmarks." 
401 } 402 ], 403 "cited_papers": [ 404 { 405 "title": "Scaling laws for neural language models", 406 "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"], 407 "year": 2020, 408 "arxiv_id": "2001.08361", 409 "relevance": "Foundational work on scaling laws for predicting LLM performance, which RBRIDGE extends to reasoning tasks." 410 }, 411 { 412 "title": "Emergent abilities of large language models", 413 "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"], 414 "year": 2022, 415 "relevance": "Defines the emergence phenomenon that RBRIDGE aims to bridge — reasoning capabilities appearing only at larger model scales." 416 }, 417 { 418 "title": "Are emergent abilities of large language models a mirage?", 419 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 420 "year": 2023, 421 "relevance": "Proposes continuous metrics (TED, MPCA) as baselines, arguing emergence is a metric artifact. RBRIDGE compares against these." 422 }, 423 { 424 "title": "DataDecide: How to predict best pretraining data with small experiments", 425 "authors": ["Ian Magnusson", "Nguyen Tai", "Ben Bogin"], 426 "year": 2025, 427 "relevance": "Provides the dataset ranking benchmark and protocol used in RBRIDGE's experiment (i)." 428 }, 429 { 430 "title": "Predicting emergent capabilities by finetuning", 431 "authors": ["Charlie Victor Snell", "Eric Wallace", "Dan Klein"], 432 "year": 2024, 433 "relevance": "Proposes intermediate SFT to predict emergent capabilities, used as a baseline (iSFT) in RBRIDGE experiments." 434 }, 435 { 436 "title": "An empirical analysis of compute-optimal large language model training", 437 "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"], 438 "year": 2022, 439 "relevance": "Chinchilla scaling laws for compute-optimal training — foundational work RBRIDGE builds upon for cost-efficient pre-training." 
440 }, 441 { 442 "title": "Chain-of-thought prompting elicits reasoning in large language models", 443 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 444 "year": 2022, 445 "arxiv_id": "2201.11903", 446 "relevance": "Introduces chain-of-thought reasoning traces used as gold labels in RBRIDGE." 447 }, 448 { 449 "title": "Evaluating large language models trained on code", 450 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 451 "year": 2021, 452 "arxiv_id": "2107.03374", 453 "relevance": "HumanEval benchmark used in RBRIDGE experiments for evaluating code generation capability." 454 }, 455 { 456 "title": "Understanding emergent abilities of language models from the loss perspective", 457 "authors": ["Zhengxiao Du", "Aohan Zeng", "Yuxiao Dong"], 458 "year": 2024, 459 "relevance": "Studies emergence at granular scale (300M-3B), directly motivating RBRIDGE's approach to bridging scale gaps." 460 }, 461 { 462 "title": "DoReMi: Optimizing data mixtures speeds up language model pretraining", 463 "authors": ["Sang Michael Xie", "Hieu Pham", "Xuanyi Dong"], 464 "year": 2023, 465 "relevance": "Uses proxy models for data mixture optimization — related approach that RBRIDGE improves upon for reasoning tasks." 466 } 467 ] 468 }