scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22585B)
      1 {
      2   "paper": {
      3     "title": "Bayesian Reward Models for LLM Alignment",
      4     "authors": [
      5       "Adam X. Yang",
      6       "Maxime Robeyns",
      7       "Thomas Coste",
      8       "Zhengyan Shi",
      9       "Jun Wang",
     10       "Haitham Bou Ammar",
     11       "Laurence Aitchison"
     12     ],
     13     "year": 2024,
     14     "venue": "Structured Probabilistic Inference & Generative Modeling workshop, ICML 2024",
     15     "arxiv_id": "2402.13210",
     16     "doi": "10.48550/arXiv.2402.13210"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No code repository URL is provided in the paper. No GitHub link, Zenodo archive, or any other code release is mentioned."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses the publicly available AlpacaFarm dataset (Dubois et al., 2024) for both SFT and reward model training. The Pythia model suite is also publicly available."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No environment specification, requirements.txt, Dockerfile, or detailed dependency list is provided. The paper mentions using Pythia models and LoRA but does not specify software versions or dependencies."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions, README, or reproduction scripts are provided. The experimental details in Appendix C describe hyperparameters but not how to run the experiments."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Results in Figures 2-5 show only point curves without confidence intervals, error bars, or uncertainty bands. No ± notation or CI values appear in the text."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims Laplace-LoRA 'effectively mitigate[s] reward overoptimization' and that LA Ens 'demonstrated further enhancements' over ensembles, but no statistical significance tests (p-values, bootstrap tests, etc.) are reported."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Results are shown only as curves in figures (proxy/gold reward vs. KL divergence). No numerical effect sizes, percentage improvements, or baseline-contextualized differences are reported in the text."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper uses 1000 prompts and 12,500 responses per prompt for BoN sampling (Section 5) but provides no justification for why these numbers were chosen or whether they are sufficient."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or spread across experimental runs is reported. Results appear to be single-run without any indication of variability across seeds or initializations."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper compares against the standard MAP (single reward model) baseline and reward model ensembles (Ens) across all experiments (Figures 2-5)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Baselines include reward model ensembles (Coste et al., 2024; Eisenstein et al., 2023) and weight averaging (Ramé et al., 2024), which are contemporary approaches from 2023-2024."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper compares variance-based vs. standard deviation-based penalties (Figures 2a vs 2b, 3a vs 3b), tests multiple values of the hyperparameter k (1, 5, 10, 20, 30), and compares single LA vs. LA combined with ensembles."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Results are evaluated using both proxy reward and gold reward scores (left and right columns of all figures), which measure different aspects of performance."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "The paper uses a synthetic evaluation setup with a gold reward model as the ground truth. No human evaluation of the selected responses is conducted, though the system's goal is to improve alignment with human preferences."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper uses the AlpacaFarm instructions validation dataset (a separate split from the training data) for BoN evaluation (Section 5: 'we collect a subset of 1000 prompts from the AlpacaFarm instructions validation dataset')."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Results are shown only as aggregate curves across all prompts. No per-category or per-task breakdowns are provided (e.g., by prompt difficulty, domain, or response length)."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Appendix D discusses failure cases: larger penalty values (k=20, 30) degrade performance for standard deviation-based penalty more significantly, while variance-based penalty is more robust. The paper also shows the failure mode of MAP (overoptimization) in Figure 1."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper reports that larger k values (20, 30) degrade performance for the standard deviation-based penalty (Appendix D). The conclusion also mentions RLHF results, but no RLHF figures or detailed results are presented in the paper."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims that 'the resulting uncertainty estimates can effectively mitigate reward overoptimization in BoN sampling.' The figures show improved gold reward scores at high KL divergence for Laplace-LoRA methods, supporting this claim within the synthetic evaluation framework."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper makes causal claims about uncertainty penalties mitigating overoptimization. The ablation design (varying k, comparing variance vs. std penalties, single vs. ensemble) provides controlled single-variable manipulations that adequately support these claims within the experimental framework."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The abstract and conclusion make broad claims about mitigating reward overoptimization generally, but experiments use only a single synthetic setup (Pythia 70M proxy, LLaMA 7B gold, AlpacaFarm data). The title 'Bayesian Reward Models for LLM Alignment' suggests generality that is not supported by the limited experimental setting."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, it does not consider whether the penalty simply constrains optimization (a simpler mechanism than Bayesian uncertainty) or whether other forms of regularization might achieve similar effects."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper specifies exact model architectures: Pythia 1.4B for the LLM policy, Pythia 70M for the proxy reward model, and LLaMA 7B for the gold reward model (Section 5). These are specific named models from known suites with known parameter counts."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The prompt format with special tokens (<|prompter|>, <|assistant|>, <|endoftext|>) is provided in Appendix C.1 (Table 2) with a concrete example."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Detailed hyperparameter tables are provided in Appendix C: Table 3 for SFT (learning rate, scheduler, batch size, epochs) and Table 4 for reward model training (LoRA r, alpha, dropout, weight decay, learning rate, scheduler, batch size, max sequence length)."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The approach is a standard Bayesian post-hoc technique applied to reward models."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 5 describes the data pipeline: SFT on AlpacaFarm 'sft' split (10k instruction-response pairs), response generation from the SFT policy, gold reward labeling to create proxy training data, and BoN evaluation on 1000 validation prompts with 12,500 responses each."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations or threats-to-validity section in the paper."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No specific threats to validity are discussed. The paper does not address the limitations of its synthetic evaluation setup, the small model sizes used, or potential issues with the Laplace approximation."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do not show. There is no discussion of which settings (model sizes, datasets, alignment methods beyond BoN) the results may not transfer to."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data (generated responses, reward scores, BoN selection results) is made available for independent verification. Only aggregate curves are shown in figures."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The data generation process is described: responses are sampled from the SFT policy, labeled by the gold reward model, and proxy reward models are trained on these synthetic labels (Section 5)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants were involved. The paper uses synthetic data generated from LLMs and public datasets (AlpacaFarm)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline is documented: pretrained Pythia → SFT on AlpacaFarm → generate responses → gold reward labeling → proxy reward model training with LoRA → Laplace-LoRA post-hoc → BoN evaluation on validation prompts (Sections 5 and Appendix C)."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding sources are disclosed in the paper. There is no acknowledgments section mentioning grants or sponsors."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: University of Bristol (authors 1, 2, 7), Huawei Noah's Ark Lab (authors 3, 6), and University College London (authors 4, 5)."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed. Two authors are from Huawei Noah's Ark Lab, which has a commercial interest in LLM alignment methods, but no funding statement is made."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is included in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It trains reward models on synthetic data and evaluates them in a synthetic BoN sampling setup. The evaluation tests the method, not model knowledge."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Not applicable: the paper does not evaluate pre-trained model knowledge on any benchmark. The reward models are trained on synthetic labels and evaluated in a synthetic framework."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Not applicable: benchmark contamination is not relevant because the paper evaluates a reward modeling method, not model knowledge on existing benchmarks."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants are involved in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants are involved in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants are involved in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants are involved in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants are involved in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants are involved in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants are involved in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference cost, latency, or computational overhead of the Laplace-LoRA approach is reported. The paper does not discuss how much additional computation the Laplace approximation and uncertainty estimation require compared to standard reward models."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total computational budget, GPU hours, or hardware specifications are stated. The paper does not quantify the resources needed for training or evaluation."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "Laplace-LoRA uncertainty estimates can effectively mitigate reward overoptimization in BoN sampling.",
    295       "evidence": "Figures 2 and 3 show that Laplace-LoRA reward models (with both variance and std-based penalties) achieve higher gold reward scores at high KL divergence compared to MAP baseline, indicating reduced overoptimization.",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "Combining Laplace-LoRA with reward ensembles (LA Ens) further enhances performance beyond either approach alone.",
    300       "evidence": "Figure 3 shows LA Ens outperforms both MAP and standard ensembles (Ens) on gold reward at high KL divergence values.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "Variance-based penalty is more robust than standard deviation-based penalty at large k values.",
    305       "evidence": "Appendix D (Figures 4-5) shows that larger k values (20, 30) degrade performance more for std-based penalty than variance-based penalty.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "Laplace-LoRA achieves the highest gold reward in RLHF without KL penalty.",
    310       "evidence": "Mentioned in the conclusion (Section 7) but no RLHF figures or detailed results are presented in the paper itself.",
    311       "supported": "weak"
    312     }
    313   ],
    314   "methodology_tags": [
    315     "benchmark-eval"
    316   ],
    317   "key_findings": "The paper proposes using Laplace-LoRA (post-hoc Laplace approximation on LoRA weights) to add uncertainty estimates to reward models for LLM alignment. In a synthetic evaluation using Pythia models and AlpacaFarm data, the uncertainty-penalized reward models mitigate reward overoptimization in best-of-n sampling compared to standard MAP reward models. The approach also combines beneficially with reward model ensembles. However, results are limited to a single synthetic setup with small models (70M proxy, 1.4B policy) and lack statistical rigor (no error bars, significance tests, or multiple runs).",
    318   "red_flags": [
    319     {
    320       "flag": "No error bars or variance across runs",
    321       "detail": "All results are presented as single curves without any uncertainty quantification. For a paper about uncertainty estimation, the absence of error bars across experimental runs is particularly concerning — the reader cannot assess whether the observed improvements are robust or within noise."
    322     },
    323     {
    324       "flag": "Purely synthetic evaluation",
    325       "detail": "The gold reward model (LLaMA 7B from AlpacaFarm) serves as both the label generator and ground-truth evaluator. Improvements in gold reward may not translate to actual human preference improvement. No human evaluation is conducted."
    326     },
    327     {
    328       "flag": "Small model sizes",
    329       "detail": "The proxy reward model is only 70M parameters and the policy is 1.4B. It is unclear whether the Laplace-LoRA approach would be as effective at scales typical of production RLHF systems (7B+ reward models, 70B+ policies)."
    330     },
    331     {
    332       "flag": "RLHF claim without supporting evidence",
    333       "detail": "The conclusion claims 'This also holds in RLHF, where it achieves the highest gold reward without the application of KL penalty' but no RLHF results are shown in the paper."
    334     },
    335     {
    336       "flag": "No limitations section",
    337       "detail": "This workshop paper proposes a new method but contains no discussion of limitations, threats to validity, or scope boundaries."
    338     }
    339   ],
    340   "cited_papers": [
    341     {
    342       "title": "Scaling laws for reward model overoptimization",
    343       "authors": ["Leo Gao", "John Schulman", "Jacob Hilton"],
    344       "year": 2023,
    345       "relevance": "Foundational work on reward overoptimization that motivates this paper's approach, directly relevant to LLM alignment safety."
    346     },
    347     {
    348       "title": "Reward model ensembles help mitigate overoptimization",
    349       "authors": ["Thomas Coste", "Usman Anwar", "Robert Kirk", "David Krueger"],
    350       "year": 2024,
    351       "relevance": "Key baseline method for mitigating reward hacking through ensembles; this paper extends the approach with Bayesian uncertainty."
    352     },
    353     {
    354       "title": "Helping or herding? Reward model ensembles mitigate but do not eliminate reward hacking",
    355       "authors": ["Jacob Eisenstein"],
    356       "year": 2023,
    357       "arxiv_id": "2312.09244",
    358       "relevance": "Examines limitations of reward model ensembles for preventing reward hacking in LLM alignment."
    359     },
    360     {
    361       "title": "Training language models to follow instructions with human feedback",
    362       "authors": ["Long Ouyang"],
    363       "year": 2022,
    364       "relevance": "Foundational RLHF paper describing the reward model + policy optimization paradigm for LLM alignment."
    365     },
    366     {
    367       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    368       "authors": ["Yuntao Bai"],
    369       "year": 2022,
    370       "arxiv_id": "2204.05862",
    371       "relevance": "Key work on RLHF for safety alignment, establishing helpful/harmless training objectives."
    372     },
    373     {
    374       "title": "Bayesian low-rank adaptation for large language models",
    375       "authors": ["Adam X Yang", "Maxime Robeyns", "Xi Wang", "Laurence Aitchison"],
    376       "year": 2024,
    377       "relevance": "Introduces Laplace-LoRA, the core technique this paper applies to reward models for uncertainty estimation."
    378     },
    379     {
    380       "title": "Uncertainty-penalized reinforcement learning from human feedback with diverse reward LoRA ensembles",
    381       "authors": ["Yuanzhao Zhai"],
    382       "year": 2024,
    383       "arxiv_id": "2401.00243",
    384       "relevance": "Proposes diverse LoRA ensembles with uncertainty penalties for RLHF, directly comparable approach to this paper."
    385     },
    386     {
    387       "title": "Improving reinforcement learning from human feedback with efficient reward model ensemble",
    388       "authors": ["Shun Zhang"],
    389       "year": 2024,
    390       "arxiv_id": "2401.16635",
    391       "relevance": "Explores parameter-efficient reward model ensembles including LoRA ensembles for mitigating reward hacking."
    392     },
    393     {
    394       "title": "WARM: On the benefits of weight averaged reward models",
    395       "authors": ["Alexandre Ramé"],
    396       "year": 2024,
    397       "arxiv_id": "2401.12187",
    398       "relevance": "Alternative approach to reward model robustness using weight averaging, relevant baseline for reward overoptimization mitigation."
    399     },
    400     {
    401       "title": "ODIN: Disentangled reward mitigates hacking in RLHF",
    402       "authors": ["Lichang Chen"],
    403       "year": 2024,
    404       "arxiv_id": "2402.07319",
    405       "relevance": "Proposes disentangling reward from response length to mitigate reward hacking, complementary approach to Bayesian uncertainty."
    406     },
    407     {
    408       "title": "AlpacaFarm: A simulation framework for methods that learn from human feedback",
    409       "authors": ["Yann Dubois"],
    410       "year": 2024,
    411       "relevance": "Provides the simulation framework and dataset used for all experiments in this paper."
    412     }
    413   ]
    414 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs