ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24715B)


      1 {
      2   "paper": {
      3     "title": "Predicting LLM Reasoning Performance with Small Proxy Model",
      4     "authors": ["Woosung Koh", "Juyoung Suk", "Sungjun Han", "Se-Young Yun", "Jamin Shin"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2509.21013",
      8     "doi": "10.48550/arXiv.2509.21013"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "RBRIDGE enables small proxy models (≤1B) to predict large-model (13B-32B) reasoning performance by aligning evaluation with the pre-training objective (NLL) and the target task (using frontier model reasoning traces as gold labels with automatic token weighting). It achieves 100x+ compute savings for dataset ranking, strongest proxy-target correlation across six benchmarks, and successful zero-shot functional relationship transfer across pre-training datasets.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code repository URL is provided. The paper mentions open-sourcing the dataset but no link to code is given."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper states 'we plan to open-sourced our dataset' (§5) and uses publicly available benchmarks (GSM8K, MATH500, ARC-C, etc.) and OLMo checkpoints. The frontier model reasoning traces are described as planned for release."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Appendix C.1 specifies hardware: 'A100 80G, H100 and H200 nodes' and '256 H100 GPUs with HBM3' for pre-training. No software dependencies or library versions are listed, although the hardware is detailed."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper provides pseudocode (Algorithm 1 in Appendix B), the exact prompt used, and states the method is 'fully reproducible using the information provided in this paper' (§7 Reproducibility Statement). Experimental protocols reference open-source assets from Magnusson et al. (2025)."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Error bars are shown in Fig. 4 ('Error bars indicate one standard deviation'), Fig. 7a uses box-and-whisker plots showing distributional spread, and Fig. 12 shows error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are reported. Comparisons between RBRIDGE and baselines are made by comparing R² and MAE values directly without any significance test."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Effect sizes are reported contextually: e.g., '27% higher DAcc.' (§4.2(i)), '100.2× to 733.4× less FLOPs' (§4.2(i)), '74.7% NLL decline' (§3.2). R² and MAE values provide magnitude context."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for sample sizes. The number of pre-training checkpoints (15 data points at 250B intervals), the choice of 25 datasets, and the number of benchmarks (6) are not justified."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "5-fold cross-validation is used for experiments (ii) and (iii), and experiment (i) uses proxy models averaged across three pre-training seeds (§4.1(i)). Standard deviation is shown in the Fig. 4 error bars."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Five baselines for dataset ranking (§4.1(i)) and six baselines for proxy-target relationship (§4.1(ii)) are compared, including Acc./p@1, iSFT, TED, MPCA, NLL, and Rϕ."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include contemporary methods: ScalingBench (Xiao et al., 2024), DataDecide (Magnusson et al., 2025), iSFT (Snell et al., 2024), and TED (Schaeffer et al., 2023)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Fig. 7b shows an ablation study decomposing RBRIDGE into its components: Rϕ → +RBRIDGE NLL → +Normalization, showing each contributes consistent improvement across all three experimental settings."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used: R², MAE, Decision Accuracy, Kendall's Tau (Appendix D), and evaluation spans six benchmarks covering math, science, engineering, commonsense, and coding."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is irrelevant — this is a computational methodology for predicting model performance via proxy metrics."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "5-fold cross-validation is used (§4.1(ii)), reporting both train R² and test MAE. Experiment (iii) uses a separate pre-training dataset D' as held-out evaluation."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per benchmark (6 benchmarks in Tab. 2, Tab. 5, Tab. 6) rather than just aggregate averages."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses failure cases: one outlier in zero-shot transfer (CQA MAE=9.716 in Tab. 3), and limitations where frontier models fail to produce outputs in the required format (§5.1)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Negative results are reported: Fig. 2 shows small models giving wrong direction slopes, Fig. 3b shows OOD gold labels providing no signal, and Tab. 1 shows ScB performing worse than Rϕ."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims — 100x compute reduction, strongest correlation across six benchmarks, zero-shot transfer — are all supported by results in Tab. 2, Fig. 6, and Tab. 3 respectively."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims like 'alignment improves performance' are supported by ablation study (Fig. 7b) showing controlled removal of components. The paper uses controlled single-variable manipulation."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper bounds claims to specific model sizes (1B→13B, 1B→32B), specific datasets (OLMo-Mix-1124), and acknowledges limitations on scale ('larger-scale studies across more model sizes and pre-training datasets would be ideal', §4.1(iii))."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper discusses distributional alignment as an alternative explanation (§3.1), analyzes why existing approaches fail along two axes, and considers that frontier model imperfections could affect results (§5.1)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly frames RBRIDGE as a proxy metric for target-scale accuracy and discusses the gap between proxy and target metrics throughout (§2 Problem Setting defines the proxy-target relationship formally)."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper says 'we use GPT 4o to generate Rϕ' (Appendix C.3) without specifying a version or snapshot date. Model sizes (1B, 7B, 13B, 32B) are given but these are OLMo models referenced by size, not version."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The exact prompt used to generate reasoning traces is provided in Appendix B: 'System: You are a helpful assistant that solves [task] problems. User: [question] Respond ONLY with a JSON object...'"
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Greedy decoding is specified for Rϕ generation (Appendix B). SFT hyperparameters are in Tab. 4 (learning rate, warmup ratio, batch size, epochs). Pre-training follows OLMo 2 settings."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. RBRIDGE is a metric computation, not an agent system."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The alternative dataset D' composition is described (Appendix C.4): '8.5:1:0.5 ratio of English:multilingual:math/code' with specific dataset sources. Frontier model output extraction is documented (discard answer, keep reasoning trace)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5.1 'Limitation and Future Direction' discusses three specific limitations of RBRIDGE."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "§5.1 discusses specific threats: frontier models do not achieve perfect accuracy on reasoning tasks, format failures can occur in reasoning trace extraction, and a framework for practical application remains an open challenge."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper acknowledges scope limits: 'larger-scale studies across more model sizes and pre-training datasets would be ideal' (§4.1(iii)), the method is not tested on long CoT models (Appendix C.3), and HumanEval is excluded from experiment (iii) due to 0% p@1."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Raw experimental data (individual checkpoint results, per-example scores) are not released. Only aggregated results in tables and figures are provided."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data collection is well-described: benchmarks are standard public datasets, pre-training uses OLMo-Mix-1124 with checkpoints at 250B intervals, and the alternative dataset composition is specified in Appendix C.4."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are standard public benchmarks and pre-training datasets."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: frontier model generates reasoning traces → extract Rϕ → compute letter-level probabilities → aggregate to token weights → compute weighted NLL (Algorithm 1, Fig. 1)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section is present in the paper. Authors are from Trillion Labs and KAIST AI, but no funding sources are disclosed."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly stated: Trillion Labs and KAIST AI, with correspondence emails provided."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed. Trillion Labs is a company (authors Han, Shin are affiliated) that could have financial interest in efficient pre-training methods."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present. Authors from Trillion Labs may have financial interests related to efficient LLM training methods."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff date is stated for the OLMo models or GPT-4o used. The paper does not discuss when the pre-training data was collected."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of potential train/test overlap between OLMo pre-training data and benchmark test sets (GSM8K, MATH500, ARC-C, etc.)."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Benchmarks like ARC-C (2018), GSM8K (2021), MATH500 (2021) were published years before model training, creating contamination risk. This is not addressed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Cost is discussed: 'a small one-time cost of under $10 per benchmark' for generating Rϕ (§5), compute measured in FLOPs throughout (Fig. 6), and 'thousands of H100 hours' for additional training runs (§4.1(iii))."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Compute is quantified via FLOPs formula (6ND, §4.1(i)), hardware specified (256 H100 GPUs, Appendix C.1), and compute savings factors explicitly stated (100.2× to 733.4×)."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Experiment (i) uses results 'averaged across three pre-training seeds' (§4.1(i), following Magnusson et al. 2025). 5-fold cross-validation provides variance estimates for experiments (ii-iii)."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Three pre-training seeds for experiment (i) (§4.1(i)), 5-fold cross-validation for experiments (ii-iii) (§4.1(ii))."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "The curve fitting hypothesis space is defined a priori: 'linear, quadratic, exponential, and logarithmic. This hypothesis space was defined a priori to avoid overfitting' (§4.1(ii))."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Configuration selection is transparent: 'Curve fitting selects the best function based on train R²' (§4.1(ii)), with the hypothesis space defined before experiments."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No multiple comparison correction is applied despite comparing 7 methods across 6 benchmarks in multiple experimental settings."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No discussion of author-evaluation bias. The authors implemented all baselines and their own method without acknowledging this potential bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Fig. 6 explicitly plots Decision Accuracy as a function of FLOPs, and Fig. 7a compares RBRIDGE at 1B against larger proxy models (7B, 13B) using the target metric."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the six benchmarks actually measure 'reasoning' as claimed. The paper assumes benchmarks like ARC-C and CQA measure reasoning without questioning construct validity."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. RBRIDGE is a metric computation method, not an agent or scaffold."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage. Benchmarks like GSM8K (2021) and MATH500 (2021) predate the OLMo training data, and this is not addressed."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of feature leakage. The 5-shot CoT evaluation setup provides exemplars that could leak information, but this is not discussed."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether benchmark examples share structural similarities with pre-training data or with each other."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention method is applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "RBRIDGE reduces dataset ranking compute cost by over 100× relative to the best baseline.",
    365       "evidence": "Fig. 6b shows RBRIDGE achieves equivalent Decision Accuracy with 100.2× to 733.4× fewer FLOPs across 25 pre-training datasets (§4.2(i)).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "RBRIDGE achieves the strongest correlation across six reasoning benchmarks at 1B to 32B scale.",
    370       "evidence": "Tab. 2 shows RBRIDGE achieves best average train R² (0.874) and test MAE (1.384) for 1B→13B, and best averages for 1B→13B+SFT and 1B→32B across all methods.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "RBRIDGE outperforms proxy models 7-13× larger using the target metric.",
    375       "evidence": "Fig. 7a shows RBRIDGE at 1B achieves lower test MAE than Acc./p@1 at 7B and 13B scale for predicting 32B performance.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "RBRIDGE enables zero-shot functional relationship transfer across pre-training datasets.",
    380       "evidence": "Tab. 3 shows transferred function achieves 5/5 correct rankings and MAE of 0.043-1.417 on most benchmarks (one outlier at 9.716), at 1B→7B scale.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Reasoning trace Rϕ is more in-distribution than benchmark-provided gold labels.",
    385       "evidence": "Fig. 4 shows 74.7% average NLL decline when using Rϕ across five reasoning benchmarks, and Tab. 1 shows better proxy performance with Rϕ.",
    386       "supported": "strong"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "No contamination analysis",
    392       "detail": "The paper evaluates OLMo models on public benchmarks (GSM8K, MATH500, ARC-C, HumanEval) without any discussion of data contamination, despite these benchmarks being publicly available before model training."
    393     },
    394     {
    395       "flag": "Limited zero-shot transfer evaluation",
    396       "detail": "The zero-shot transfer result (experiment iii) is demonstrated on only one additional dataset at 1B→7B scale with a single data point (1T tokens), making the claim of 'zero-shot transfer' rest on very limited evidence."
    397     },
    398     {
    399       "flag": "No significance testing",
    400       "detail": "All comparisons between RBRIDGE and baselines rely on point comparisons of R² and MAE values without any statistical significance tests, despite running multiple comparisons across methods and benchmarks."
    401     }
    402   ],
    403   "cited_papers": [
    404     {
    405       "title": "Scaling laws for neural language models",
    406       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    407       "year": 2020,
    408       "arxiv_id": "2001.08361",
    409       "relevance": "Foundational work on scaling laws for predicting LLM performance, which RBRIDGE extends to reasoning tasks."
    410     },
    411     {
    412       "title": "Emergent abilities of large language models",
    413       "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"],
    414       "year": 2022,
    415       "relevance": "Defines the emergence phenomenon that RBRIDGE aims to bridge — reasoning capabilities appearing only at larger model scales."
    416     },
    417     {
    418       "title": "Are emergent abilities of large language models a mirage?",
    419       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    420       "year": 2023,
    421       "relevance": "Proposes continuous metrics (TED, MPCA) as baselines, arguing emergence is a metric artifact. RBRIDGE compares against these."
    422     },
    423     {
    424       "title": "DataDecide: How to predict best pretraining data with small experiments",
    425       "authors": ["Ian Magnusson", "Nguyen Tai", "Ben Bogin"],
    426       "year": 2025,
    427       "relevance": "Provides the dataset ranking benchmark and protocol used in RBRIDGE's experiment (i)."
    428     },
    429     {
    430       "title": "Predicting emergent capabilities by finetuning",
    431       "authors": ["Charlie Victor Snell", "Eric Wallace", "Dan Klein"],
    432       "year": 2024,
    433       "relevance": "Proposes intermediate SFT to predict emergent capabilities, used as a baseline (iSFT) in RBRIDGE experiments."
    434     },
    435     {
    436       "title": "An empirical analysis of compute-optimal large language model training",
    437       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"],
    438       "year": 2022,
    439       "relevance": "Chinchilla scaling laws for compute-optimal training — foundational work RBRIDGE builds upon for cost-efficient pre-training."
    440     },
    441     {
    442       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    443       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    444       "year": 2022,
    445       "arxiv_id": "2201.11903",
    446       "relevance": "Introduces chain-of-thought reasoning traces used as gold labels in RBRIDGE."
    447     },
    448     {
    449       "title": "Evaluating large language models trained on code",
    450       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    451       "year": 2021,
    452       "arxiv_id": "2107.03374",
    453       "relevance": "HumanEval benchmark used in RBRIDGE experiments for evaluating code generation capability."
    454     },
    455     {
    456       "title": "Understanding emergent abilities of language models from the loss perspective",
    457       "authors": ["Zhengxiao Du", "Aohan Zeng", "Yuxiao Dong"],
    458       "year": 2024,
    459       "relevance": "Studies emergence at granular scale (300M-3B), directly motivating RBRIDGE's approach to bridging scale gaps."
    460     },
    461     {
    462       "title": "DoReMi: Optimizing data mixtures speeds up language model pretraining",
    463       "authors": ["Sang Michael Xie", "Hieu Pham", "Xuanyi Dong"],
    464       "year": 2023,
    465       "relevance": "Uses proxy models for data mixture optimization — related approach that RBRIDGE improves upon for reasoning tasks."
    466     }
    467   ]
    468 }

Impressum · Datenschutz