scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27298B)
      1 {
      2   "paper": {
      3     "title": "RoBoN: Routed Online Best-of-n for Test-Time Scaling with Multiple LLMs",
      4     "authors": ["Jonathan Geuter", "Gregor Kornhardt"],
      5     "year": 2025,
      6     "venue": "NeurIPS 2025 Workshop: Foundations of Reasoning in Language Models",
      7     "arxiv_id": "2512.05542",
      8     "doi": "10.48550/arXiv.2512.05542"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "ROBON, a sequential multi-LLM best-of-n method that routes generations across models using reward and agreement signals, consistently outperforms single-model BoN on five reasoning benchmarks for larger n (n≥16), with accuracy gains up to 3.4 percentage points. The agreement-weighted scoring mechanism helps mitigate reward hacking that causes single-model BoN performance to degrade at large n. The method is training-free, maintains compute parity with standard BoN, and works with any plug-in reward model.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "GitHub repository provided: 'The implementation is available at https://github.com/j-geuter/RoBoN' (Section 4, Implementation)."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper states 'we also provide the full dataset of all generated responses by all four models on all five datasets, with corresponding rewards and normalized rewards' (Section 4). All benchmarks used are also publicly available."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions 'single H100 GPU' and vLLM but does not provide a requirements.txt, Dockerfile, or detailed library version listing in the paper text."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The NeurIPS checklist states 'The full source code including a README with instructions is provided' (Q5), and the full dataset of generated responses is released, enabling both reproduction from scratch and verification of reported numbers."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "All tables and figures report '1-sigma confidence intervals' (Section 4: 'we report the average accuracy as well as the 1-sigma confidence intervals'). Tables show ± notation throughout."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims ROBON 'significantly outperforms' baselines (Figure 1 caption, Section 4.1) but reports no statistical significance tests (no p-values, t-tests, or other tests). Differences are assessed only by comparing point estimates with confidence intervals."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Absolute accuracy differences are reported with baseline context throughout, e.g., 'accuracy gains of up to 3.4% over baselines' (Section 4.1), and tables provide exact accuracy values for all methods enabling direct comparison (e.g., 0.564 vs 0.581 at n=256, Table 1)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for why these specific 5 benchmarks were chosen, and no power analysis is provided for detecting differences of the reported magnitudes."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "1-sigma confidence intervals are reported in all tables and figures, capturing variability across experimental runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Three baselines are compared: (a) single-model BoN for each of 4 models, (b) average accuracy across individual BoN strategies, (c) equal allocation (n/M samples per model). Described in Section 4."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The baselines (single-model BoN, uniform portfolio) are the natural comparisons for this multi-model BoN setting. Models used (Qwen2.5, Llama-3.1, DeepSeek-Coder) are recent releases."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Appendix A.2 provides an ablation over α (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), and Appendix A.3 analyzes model selection shares. The ablation shows α=1.0 (reward-only, no agreement) performs significantly worse."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Only accuracy is reported as a metric across all experiments. No secondary metrics (e.g., diversity of selected responses, reward distribution, calibration) are evaluated."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is irrelevant for automated math/reasoning benchmark evaluation where ground truth answers exist."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are reported on standard benchmark test sets (MATH500, OlympiadBench, MinervaMath, GSM8K, MMLU-STEM) which are established test splits."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Appendix A.1 (Tables 2-6) provides per-dataset results for all five benchmarks, and Appendix A.3 shows per-dataset model selection distributions."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses performance degradation at large n due to reward hacking (Section 4.1, Figure 1), and ROBON's poor performance at n=1 and n=4: 'ROBON lags behind baselines for n=1 and n=4.'"
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Several negative results reported: ROBON underperforms at small n (n=1, n=4), α=1.0 (reward-only) performs 'significantly worse' (Appendix A.2), performance degrades on some datasets at large n due to reward hacking, and the equal baseline underperforms individual BoN at small n."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract's claim of 'gains of up to 3.4% in absolute accuracy' matches MATH500 at n=256 (0.804→0.838, 3.4pp, Table 2). The claim of consistent outperformance at larger n is supported by Table 1 and the per-dataset tables. The claim of improvement over a uniform portfolio is supported by the equal-baseline comparisons."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper claims 'diversity across models can be exploited at inference to improve best-of-n performance.' This is supported by controlled comparisons: single-model BoN vs multi-model ROBON at matched compute budgets, and ablation over α isolating the agreement term's contribution (Appendix A.2)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper explicitly bounds scope: 'ROBON in its current form is only applicable to tasks where it can be immediately verified whether two answers are identical, as is often the case on reasoning datasets' (Section 5). Future work lists extending to other domains and model suites."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for ROBON's gains. Could the improvement be from having more diverse sampling distributions rather than the routing algorithm? Could the reward normalization be doing most of the work? The ablation over α addresses one dimension but doesn't consider broader alternatives."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures accuracy on reasoning benchmarks and claims accuracy improvement. The measurements directly match the claimed outcome with no proxy gap."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Exact model identifiers provided: 'Qwen2.5-Math-7B-Instruct, DeepSeek-Coder-6.7B-Instruct, Llama-3.1-8B-Instruct, Qwen2.5-Coder-7B-Instruct' and 'Skywork/Skywork-Reward-V2-Llama-3.1-8B' (Section 4). These are HuggingFace model card names sufficient for exact reproduction."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No prompts or prompt templates are shown in the paper. The paper does not describe how benchmark questions are formatted for each model (system prompts, chat templates, instructions for boxed answers)."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Key hyperparameters reported: 'α = 0.4', 'β = 1e5', 'temperature = 1.0 and top_p = 0.95' (Section 4, Hyperparameters)."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. ROBON is a sampling and selection method, not an agentic system."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Reward normalization via empirical CDF is documented (Section 4, Reward Normalization), and answer extraction/normalization is described: 'extracting the solution from the response, and subsequently applying canonical normalizations (such as removing whitespaces and turning everything into lower case)' (Section 3)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5 'Discussion and Limitations' provides substantive discussion of runtime cost, applicability constraints, need for reward normalization, and future directions."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats discussed: runtime penalty from sequential generation, applicability limited to tasks with verifiable answer identity, need for pre-computed reward corpus for normalization, ROBON underperforms at small n, and difficulty of deriving theoretical guarantees."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Explicit scope boundaries: 'ROBON in its current form is only applicable to tasks where it can be immediately verified whether two answers are identical' (Section 5). Future work acknowledges need to 'extend ROBON to other domains' and 'verify its benefits on different suites of models.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Full raw data released: 'we also provide the full dataset of all generated responses by all four models on all five datasets, with corresponding rewards and normalized rewards' (Section 4)."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data generation procedure described: specific models, vLLM implementation, temperature=1.0, top_p=0.95, single H100 GPU, standard benchmark datasets with specific splits (e.g., 'OE_TO_maths_en_COMP split')."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are standard public benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Pipeline documented: generate responses per model → compute rewards via reward model → normalize rewards via empirical CDF → run ROBON routing algorithm → select final output via BoN on set S. Each step is formally described in Algorithm 1."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgements section: 'JG is supported by a fellowship from the Kempner Institute for the Study of Natural and Artificial Intelligence at Harvard University.'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations listed: Harvard University (School of Engineering, Kempner Institute) and Technische Universität Berlin (Department of Mathematics). No evaluated products are from these institutions."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The Kempner Institute is an academic research institute with no financial stake in whether ROBON outperforms BoN baselines."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial disclosure statement is included in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the four models (Qwen2.5-Math, Qwen2.5-Coder, DeepSeek-Coder, Llama-3.1). These models were released in 2023–2024, and benchmarks like GSM8K (2021) and MMLU (2021) predate them."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether benchmark problems (GSM8K, MMLU, MATH500) appeared in the training data of the models used."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "GSM8K (2021), MMLU (2021), and MATH500 are well-known benchmarks that predate all four models used. No contamination analysis or discussion is provided."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The paper discusses compute parity in FLOPs and qualitative runtime analysis ('worst-case, ROBON suffers an additional factor of n') but explicitly declines to provide actual runtime comparisons: 'This is the reason we decided not to include an explicit runtime comparison to regular best-of-n.' No wall-clock times, token counts, or costs reported."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Only 'single H100 GPU' is mentioned. No total GPU hours, wall-clock time, or total compute budget is reported for the experiments."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "While 1-sigma confidence intervals are reported, the paper does not explicitly discuss sensitivity to random seeds or state that results are reported across multiple seeds."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The paper does not state how many experimental runs produced the reported confidence intervals. The source of variance in the confidence intervals is not explained."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The paper states 'We set α = 0.4' and 'we found that larger values of β work better, so we use β = 1e5' but does not report how many configurations were tried or the search method used to arrive at these values."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The ablation over α (Figure 2) shows performance on the test benchmarks themselves (MATH500, OlympiadBench, MinervaMath). No validation set is used for configuration selection — α=0.4 appears selected based on test set performance."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Comparisons are made across 5 datasets, 4 models, and multiple n values with no multiple comparison correction applied."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement both ROBON and all baselines without acknowledging potential bias from evaluating their own system. No independent evaluation or discussion of this bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Section 4 'Compute, Memory, and Runtime' explicitly compares ROBON and BoN at matched compute budgets: 'In terms of FLOPs, ROBON is asymptotically exactly en par with standard best-of-n' and all comparisons use the same n (same total generations)."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the benchmarks (MATH500, GSM8K, MMLU-STEM, etc.) actually measure the reasoning capabilities the paper claims to improve. Benchmarks are used without questioning construct validity."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No agentic scaffolding is involved. ROBON is a scoring/routing mechanism applied identically across all models — there is no scaffold confound."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage. Models released in 2023–2024 are tested on benchmarks from 2021 (GSM8K, MMLU) without addressing whether training data included benchmark solutions."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information (e.g., whether benchmark formatting provides hints)."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of independence between train and test data for the models used."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method applied (no canary strings, membership inference, decontamination, etc.)."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "ROBON consistently outperforms standard best-of-n applied to each individual model for larger n, with gains of up to 3.4% in absolute accuracy.",
    365       "evidence": "Table 1 and per-dataset tables (Tables 2-6) show ROBON outperforming all single-model BoN baselines at n=64 and n=256. The 3.4pp gain corresponds to MATH500 n=256: deepseek BoN 0.804 vs ROBON 0.838 (Table 2).",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "ROBON improves over a uniform multi-model portfolio baseline (equal allocation).",
    370       "evidence": "Table 1: at n=256, equal baseline achieves 0.560 vs ROBON 0.581 (2.1pp gain). Consistent improvement shown across all n≥16 and all datasets.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "ROBON achieves accuracies that best-of-n with any single model fails to achieve for any n.",
    375       "evidence": "Figure 1 shows single-model BoN performance plateauing or declining at large n while ROBON continues to improve. On MATH500 and MinervaMath, single-model BoN degrades past n≈16 while ROBON maintains higher accuracy.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The agreement term mitigates reward hacking compared to reward-only scoring.",
    380       "evidence": "Figure 2 ablation shows α=1.0 (reward-only, no agreement) performs 'significantly worse.' Figure 1 shows ROBON's performance degrades less than single-model BoN at large n on MATH500 and MinervaMath.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "ROBON is robust to different values of α, as long as α < 1.",
    385       "evidence": "Figure 2 shows similar performance for α=0.0-0.8, with only α=1.0 showing significant degradation. The ablation covers 6 values of α across 3 datasets.",
    386       "supported": "strong"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "No significance tests despite 'significant' claims",
    392       "detail": "The paper uses the word 'significantly' (Figure 1 caption: 'ROBON significantly outperforms') without conducting any statistical significance tests. Differences are assessed only by comparing point estimates with overlapping confidence intervals."
    393     },
    394     {
    395       "flag": "Hyperparameter selection on test data",
    396       "detail": "The α ablation (Figure 2) evaluates different α values on the test benchmarks themselves. α=0.4 was selected as having 'a slight edge' based on test set performance, with no separate validation set used for hyperparameter selection."
    397     },
    398     {
    399       "flag": "No contamination analysis for old benchmarks",
    400       "detail": "GSM8K (2021), MMLU (2021), and MATH500 are well-known benchmarks that predate all four models used (released 2023–2024). Model training data likely contains benchmark solutions, and this is not discussed."
    401     },
    402     {
    403       "flag": "Cherry-picked maximum gain reported",
    404       "detail": "The abstract highlights 'up to 3.4%' gain, which comes from a single dataset-baseline combination (MATH500 n=256 vs deepseek). The average gain across datasets at n=256 is 1.7pp (Table 1: 0.564→0.581), and on some datasets gains are marginal (GSM8K: 0.4pp, MMLU: 0.9pp)."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters",
    410       "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    411       "year": 2024,
    412       "arxiv_id": "2408.03314",
    413       "relevance": "Foundational work on test-time compute scaling showing it can outperform model parameter scaling."
    414     },
    415     {
    416       "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling",
    417       "authors": ["Bradley Brown", "Jordan Juravsky", "Ryan Saul Ehrlich"],
    418       "year": 2025,
    419       "relevance": "Studies inference compute scaling via repeated sampling, directly relevant to best-of-n approaches."
    420     },
    421     {
    422       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    423       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    424       "year": 2024,
    425       "relevance": "Proposes LLM routing for cost reduction, a key related approach to multi-model inference."
    426     },
    427     {
    428       "title": "LLM-blender: Ensembling large language models with pairwise ranking and generative fusion",
    429       "authors": ["Dongfu Jiang", "Xiang Ren", "Bill Yuchen Lin"],
    430       "year": 2023,
    431       "relevance": "Demonstrates that ensembling multiple LLMs via learned ranking and fusion can outperform individual models."
    432     },
    433     {
    434       "title": "Mixture-of-Agents Enhances Large Language Model Capabilities",
    435       "authors": ["Junlin Wang", "Jue Wang", "Ben Athiwaratkun", "Ce Zhang", "James Zou"],
    436       "year": 2025,
    437       "relevance": "Multi-model aggregation approach that exploits diverse LLM outputs for improved performance."
    438     },
    439     {
    440       "title": "Harnessing Multiple Large Language Models: A Survey on LLM Ensemble",
    441       "authors": ["Zhijun Chen"],
    442       "year": 2025,
    443       "arxiv_id": "2502.18036",
    444       "relevance": "Survey covering LLM ensembling techniques including routing, fusion, and multi-model strategies."
    445     },
    446     {
    447       "title": "RouterEval: A Comprehensive Benchmark for Routing LLMs to Explore Model-level Scaling Up in LLMs",
    448       "authors": ["Zhongzhan Huang"],
    449       "year": 2025,
    450       "arxiv_id": "2503.10657",
    451       "relevance": "Benchmark for evaluating LLM routing strategies, directly relevant to model selection and routing."
    452     },
    453     {
    454       "title": "Large Language Model Routing with Benchmark Datasets",
    455       "authors": ["Tal Shnitzer", "Anthony Ou", "Mírian Silva"],
    456       "year": 2024,
    457       "relevance": "Explores training parametric routers to select among LLMs per input, a key alternative approach to ROBON."
    458     },
    459     {
    460       "title": "Scaling Laws for Reward Model Overoptimization",
    461       "authors": ["Leo Gao", "John Schulman", "Jacob Hilton"],
    462       "year": 2023,
    463       "relevance": "Documents reward hacking in best-of-n sampling, a key problem ROBON's agreement term aims to mitigate."
    464     },
    465     {
    466       "title": "s1: Simple test-time scaling",
    467       "authors": ["Niklas Muennighoff"],
    468       "year": 2025,
    469       "arxiv_id": "2501.19393",
    470       "relevance": "Sequential test-time scaling via extended thinking tokens, representing an alternative approach to inference-time compute scaling."
    471     },
    472     {
    473       "title": "Theoretical Guarantees on the Best-of-n Alignment Policy",
    474       "authors": ["Ahmad Beirami"],
    475       "year": 2025,
    476       "relevance": "Provides theoretical foundations for best-of-n as an alignment strategy, directly relevant to ROBON's BoN framework."
    477     },
    478     {
    479       "title": "A Survey on Test-Time Scaling in Large Language Models: What, How, Where, and How Well?",
    480       "authors": ["Qiyuan Zhang"],
    481       "year": 2025,
    482       "arxiv_id": "2503.24235",
    483       "relevance": "Comprehensive survey on test-time scaling approaches covering the landscape ROBON contributes to."
    484     }
    485   ]
    486 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs