scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (34062B)
      1 {
      2   "paper": {
      3     "title": "RMB: Comprehensively Benchmarking Reward Models in LLM Alignment",
      4     "authors": [
      5       "Enyu Zhou",
      6       "Guodong Zheng",
      7       "Binghai Wang",
      8       "Zhiheng Xi",
      9       "Shihan Dou",
     10       "Rong Bao",
     11       "Wei Shen",
     12       "Limao Xiong",
     13       "Jessica Fan",
     14       "Yurong Mou",
     15       "Rui Zheng",
     16       "Tao Gui",
     17       "Qi Zhang",
     18       "Xuanjing Huang"
     19     ],
     20     "year": 2024,
     21     "venue": "International Conference on Learning Representations (ICLR 2025)",
     22     "arxiv_id": "2410.09893",
     23     "doi": "10.48550/arXiv.2410.09893"
     24   },
     25   "scan_version": 3,
     26   "active_modules": ["experimental_rigor", "data_leakage"],
     27   "checklist": {
     28     "artifacts": {
     29       "code_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The abstract states: 'Our evaluation code and datasets are available at https://github.com/Zhou-Zoey/RMB-Reward-Model-Benchmark.' A concrete GitHub URL is provided."
     33       },
     34       "data_released": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The abstract states that code and datasets are available at the GitHub URL. The reproducibility statement adds: 'We will release the code and datasets upon acceptance.' The paper was published at ICLR 2025, so the data should be released."
     38       },
     39       "environment_specified": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper does not mention requirements.txt, Dockerfile, or detailed environment setup. Appendix C references evaluation settings but not library versions or dependency specifications."
     43       },
     44       "reproduction_instructions": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The reproducibility statement says 'We provide details to reproduce our results in Section 4, Section 5, and Appendix C,' but these describe methodology, not step-by-step instructions with commands. No README or scripts are described in the paper."
     48       }
     49     },
     50     "statistical_methodology": {
     51       "confidence_intervals_or_error_bars": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The main results in Table 3 and all appendix tables (Tables 13-16) report only point estimates with no confidence intervals, error bars, or ± notation."
     55       },
     56       "significance_tests": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Spearman rank correlation coefficients are computed with p-values reported. Figure 22 shows 'P = 7.22e-9' and 'P = 3.09e-3' for the BoN-pairwise correlation analysis. Section 5 uses Spearman correlation for validation."
     60       },
     61       "effect_sizes_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Effect sizes are provided in context: 'average decline of 17.5%' for BoN vs pairwise (Section 4.2), correlation of -0.57 between helpfulness/harmlessness rankings (Section 4.2), specific average differences in Table 7, and standard deviations in Table 8."
     65       },
     66       "sample_size_justified": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The benchmark has 18,000+ preference pairs and 3,197 prompts but no justification for why these sizes are sufficient. No power analysis or explicit sample size rationale is provided."
     70       },
     71       "variance_reported": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "Standard deviations of model scores across models are reported in Table 8, but variance across experimental runs is not reported. For generative RMs using stochastic outputs, no multi-run variance is stated for the main evaluation results."
     75       }
     76     },
     77     "evaluation_design": {
     78       "baselines_included": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "RewardBench (Lambert et al., 2024) is used as a baseline throughout. Figure 1 and Figure 4 directly compare RMB's correlation with alignment performance against RewardBench's."
     82       },
     83       "baselines_contemporary": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "RewardBench (2024) is the 'only existing reward benchmark' and is the most contemporary comparison available. Other datasets compared in Table 1 (Ultrafeedback, Helpsteer2, HH-RLHF) are also recent."
     87       },
     88       "ablation_study": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Section 6 explores impact factors of generative RMs by varying evaluation criteria (original vs verbose) and instructing methods (with/without CoT) in Table 12. They also test majority voting as a confidence-weighting mechanism."
     92       },
     93       "multiple_metrics": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Two primary metrics are used: Pairwise Accuracy (Eq. 1) and BoN Accuracy (Eq. 2). Spearman rank correlation with downstream alignment is used for validation. Results are broken down by helpfulness and harmlessness."
     97       },
     98       "human_evaluation": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Three independent human annotators compared 200 randomly sampled pairs from the dataset and from RewardBench (Table 2). A separate human-annotated held-out set was used to validate the AI feedback algorithm (Table 5, Appendix B.4)."
    102       },
    103       "held_out_test_set": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The RMB benchmark is an evaluation set not used for training any of the models being evaluated. A 'separate human-annotated held-out set' was used to validate the scoring process (Section 3.3). The BoN verification uses external benchmarks (MixEval, Arena-Hard, AdvBench)."
    107       },
    108       "per_category_breakdown": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Extensive per-category breakdowns are provided: Figure 3 shows per-task performance for helpfulness (12 tasks) and harmlessness (12 scenarios). Appendix H provides full tables (Tables 13-16) for all models across all categories. Figures 27-28 show ranking correlations across tasks."
    112       },
    113       "failure_cases_discussed": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 4.3 discusses where models struggle: 'models tend to perform well in Nonviolent Crime and Specialized Advice scenarios' but 'there is substantial variability in sex-related scenarios, where some models achieve over 70% accuracy, while others fall below random chance levels.' Privacy and Intellectual Property weaknesses are highlighted."
    117       },
    118       "negative_results_reported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 6 reports that majority voting 'may not be effective in the RM evaluation context' (Figure 24). Table 12 shows that 'using more complex helpfulness evaluation criteria tended to reduce performance.' Chain-of-Thought 'did not significantly impact larger models.'"
    122       }
    123     },
    124     "claims_and_evidence": {
    125       "abstract_claims_supported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Abstract claims are supported: 49 scenarios (verified in Appendix A.2), positive correlation with downstream alignment (Section 5, Figure 4), generalization defects revealed (Section 4.3, Figure 3), potential of generative RMs (Table 3). All empirical claims in the abstract have corresponding results."
    129       },
    130       "causal_claims_justified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Most claims are correlational (Spearman rank correlation for validation). The ablation in Table 12 varies one factor at a time (criteria complexity, CoT) which is adequate for the causal claims made ('using more complex helpfulness evaluation criteria tended to reduce performance'). Language is appropriately hedged."
    134       },
    135       "generalization_bounded": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Claims are generally bounded to the tested setting. The title says 'Comprehensively Benchmarking' but the paper clarifies this refers to the HH (helpfulness-harmlessness) alignment goal. Section 5 bounds correlation claims to specific model-benchmark combinations. The limitations section acknowledges scope constraints."
    139       },
    140       "alternative_explanations_discussed": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Appendix D discusses alternative explanations: intrinsic conflict between objectives (D.2), pitfalls of reward hacking as explanation for trade-off (D.2), and why generative RMs might outperform (D.1 discusses harnessing core generative abilities, alignment with human judgment process, and explicit reasoning). Section 6 considers why majority voting might fail."
    144       },
    145       "proxy_outcome_distinction": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The paper explicitly distinguishes between benchmark accuracy (the proxy) and actual alignment performance (the outcome). Section 5 is entirely devoted to verifying 'Can evaluation results reflect alignment performance?' They use BoN sampling on external benchmarks to test this proxy-outcome relationship."
    149       }
    150     },
    151     "setup_transparency": {
    152       "model_versions_specified": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Most models are specified with versions: 'GPT-4-turbo-2024-04-09' (footnote 2), 'GPT-4o-2024-05-13' (Table 3), 'Claude-3-5-sonnet-20240620' (Table 4), 'Llama-2-70b-chat-hf', 'Mistral-7B-Instruct-v0.1', 'Qwen2-72B-Instruct'. Specific model IDs are given for all evaluated models."
    156       },
    157       "prompts_provided": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Full prompt text is provided: Figure 15 (two-stage categorization), Figure 16 (helpfulness scoring), Figure 17 (harmlessness scoring), Figure 18 (helpfulness assessment for generative RMs), Figure 19 (harmlessness assessment). Additional prompts in Figures 25-26."
    161       },
    162       "hyperparameters_reported": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Temperature 0.2 for response generation (Appendix B.2), temperature 1.0 for BoN sampling responses (footnote 5), m=5 for BoN sampling (Section 5.1). Default chat templates used. Appendix C references evaluation settings."
    166       },
    167       "scaffolding_described": {
    168         "applies": false,
    169         "answer": false,
    170         "justification": "No agentic scaffolding is used. The evaluation involves direct scoring by discriminative RMs or pairwise comparison by generative RMs, not multi-step agent workflows."
    171       },
    172       "data_preprocessing_documented": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 3.2 and Appendix B.1 describe the full pipeline: pre-filtering by length and language, three-way cross-validation categorization, difficulty post-filtering (removing prompts where weak models performed well). Score gap thresholds (< 2.5) and diversity balancing are documented."
    176       }
    177     },
    178     "limitations_and_scope": {
    179       "limitations_section_present": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "A dedicated 'LIMITATIONS' section appears after the conclusion, with substantive discussion of four specific limitations including the use of BoN instead of full RL, lack of generative RM validation, insufficient exploration of majority voting, and LLM response diversity concerns."
    183       },
    184       "threats_to_validity_specific": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The limitations are specific to this study: (1) BoN used instead of PPO due to RL's 'time-consuming and unstable nature,' (2) generative RM BoN validation omitted because 'AI feedback could lead to bias accumulation,' (3) majority voting not explored in depth with specific hypothesis about filtering effect, (4) LLM response diversity concern with mitigation strategy stated."
    188       },
    189       "scope_boundaries_stated": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The limitations state what was NOT done: 'We did not use a full reinforcement learning process,' 'We did not include validation for generative RMs,' 'we did not explore this in depth.' They also note 'The benchmarks reliance on LLM-generated responses may limit diversity and future adaptability.'"
    193       }
    194     },
    195     "data_integrity": {
    196       "raw_data_available": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The abstract states 'Our evaluation code and datasets are available at https://github.com/Zhou-Zoey/RMB-Reward-Model-Benchmark.' The full dataset with preference pairs and BoN sets should be available."
    200       },
    201       "data_collection_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Section 3 details the full data construction process: prompts from WildChat corpus (Section 3.2), response generation from 14 specific LLMs (Table 4, Appendix B.2), GPT-4-based scoring with scenario-specific key features (Section 3.3, Figures 16-17), and preference pair construction with score gap constraints."
    205       },
    206       "recruitment_methods_described": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "For human annotators: 'The annotators are all undergraduate and graduate students. We paid them a certain wage based on the number of annotations completed' (Appendix B.4). Prompt sources are described (WildChat for help-seeking, red-teaming prompts for harmlessness)."
    210       },
    211       "data_pipeline_documented": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The full pipeline is documented: prompt selection with pre-filtering and categorization → response generation from 14 LLMs → two-stage GPT-4 scoring → preference pair construction with score gap thresholds → BoN set construction. Statistics are provided at each stage (Figures 5-6, Appendix A)."
    215       }
    216     },
    217     "conflicts_of_interest": {
    218       "funding_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "The acknowledgment section lists multiple funding sources: PCL Grant PCL2024A06, NSFC grants (62476061, 62206057, 62076069), Shanghai Rising-Star Program (23QA1400200), Natural Science Foundation of Shanghai (23ZR1403500), and Shanghai Academic Research Leader grant 22XD1401100."
    222       },
    223       "affiliations_disclosed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Author affiliations are clearly listed: Fudan University (School of Computer Science and Institute of Modern Languages), UNC Chapel Hill, and Pengcheng Laboratory. They evaluate third-party models, not their own."
    227       },
    228       "funder_independent_of_outcome": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Funders are Chinese government grants and academic programs (NSFC, Shanghai municipal programs, PCL). None have a financial interest in which reward model performs best on the benchmark."
    232       },
    233       "financial_interests_declared": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No competing interests or financial interests statement is included. The ethics statement addresses other concerns but does not declare whether any authors have financial interests related to the findings."
    237       }
    238     },
    239     "contamination": {
    240       "training_cutoff_stated": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "The paper evaluates reward models' performance on their benchmark but does not state training data cutoff dates for the evaluated models (GPT-4o, Claude-3.5-sonnet, Starling-RM-34B, etc.). This matters because WildChat is a public corpus some models may have trained on."
    244       },
    245       "train_test_overlap_discussed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No discussion of whether any evaluated reward models were trained on data overlapping with WildChat prompts or the specific response models used. The discriminative RMs in particular are trained on preference data that could share distribution with RMB."
    249       },
    250       "benchmark_contamination_addressed": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "While the benchmark is newly constructed (reducing contamination risk), the paper does not explicitly discuss whether the source data (WildChat) or similar preference pairs could have appeared in training data of the evaluated models."
    254       }
    255     },
    256     "human_studies": {
    257       "pre_registered": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "The paper evaluates reward models, not human participants. Human annotators are used for data validation but are not research subjects."
    261       },
    262       "irb_or_ethics_approval": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human subjects study. The ethics statement says 'This work does not involve... research with human subjects.' Annotators provided data labels and were not studied as participants."
    266       },
    267       "demographics_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human subjects study. Annotators are described as 'undergraduate and graduate students' but are not research participants."
    271       },
    272       "inclusion_exclusion_criteria": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human subjects study."
    276       },
    277       "randomization_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human subjects study."
    281       },
    282       "blinding_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human subjects study."
    286       },
    287       "attrition_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human subjects study."
    291       }
    292     },
    293     "cost_and_practicality": {
    294       "inference_cost_reported": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "The benchmark requires extensive API calls (GPT-4 for scoring, GPT-4o and Claude-3.5-sonnet for evaluation, multiple models for response generation and BoN sampling) but no inference costs are reported."
    298       },
    299       "compute_budget_stated": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No total computational budget, GPU hours, or API spend is reported despite the extensive use of 14 LLMs for response generation, GPT-4 for scoring, and multiple reward model evaluations."
    303       }
    304     },
    305     "experimental_rigor": {
    306       "seed_sensitivity_reported": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Generative RM evaluations are stochastic but results are not reported across multiple random seeds. The majority voting experiment (10 iterations) is separate from the main evaluation. Discriminative RM evaluations are deterministic."
    310       },
    311       "number_of_runs_stated": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The main evaluation does not state the number of runs. The majority voting experiment specifies '10 iterations of voting on two 70B-level LLMs' but this is a separate analysis, not the main evaluation protocol."
    315       },
    316       "hyperparameter_search_budget": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "No hyperparameter search budget is reported. The evaluation prompts and settings appear chosen without describing how many configurations were tried or how the final setup was selected."
    320       },
    321       "best_config_selection_justified": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The evaluation configuration (prompts, scoring criteria, score gap threshold of 2.5) is used without explicit justification for why these specific values were chosen over alternatives. The human-annotated held-out set was used for 'refining' but the selection process is not detailed."
    325       },
    326       "multiple_comparison_correction": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "Many comparisons are made across 19 models, multiple scenarios, and multiple metrics. The correlation analyses in Figure 4 compare across multiple benchmarks and sampling models. No multiple comparison correction (Bonferroni, etc.) is applied."
    330       },
    331       "self_comparison_bias_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The authors constructed RMB and then evaluated it against RewardBench. They do not acknowledge that, as benchmark creators, they may have optimized RMB's design to show favorable correlation with alignment performance. No independent evaluation or bias discussion is provided."
    335       },
    336       "compute_budget_vs_performance": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "Models ranging from 7B to 72B parameters and API models are compared without discussing compute requirements. An 8B discriminative RM is compared against GPT-4o without noting the massive compute difference."
    340       },
    341       "benchmark_construct_validity": {
    342         "applies": true,
    343         "answer": true,
    344         "justification": "Section 5 is entirely devoted to construct validity: verifying that RMB scores correlate with downstream alignment task performance via BoN sampling on MixEval, Arena-Hard, and AdvBench. This directly addresses whether the benchmark measures what it claims."
    345       },
    346       "scaffold_confound_addressed": {
    347         "applies": false,
    348         "answer": false,
    349         "justification": "No scaffolding is involved in reward model evaluation. Models are evaluated directly on preference pairs and BoN lists."
    350       }
    351     },
    352     "data_leakage": {
    353       "temporal_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "WildChat data is public and could have been in training data of the evaluated models. The paper does not discuss temporal ordering between benchmark construction and model training. Some reward models may have been trained on similar preference data from the same source models."
    357       },
    358       "feature_leakage_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether the evaluation setup leaks information. For generative RMs seeing both responses simultaneously, position bias is acknowledged but feature leakage through the evaluation format is not discussed."
    362       },
    363       "non_independence_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No discussion of whether discriminative RMs were trained on preference data sharing distribution with RMB's construction process (e.g., GPT-4-scored preference pairs from similar prompt distributions)."
    367       },
    368       "leakage_detection_method": {
    369         "applies": true,
    370         "answer": false,
    371         "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipeline is used."
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "RMB demonstrates a stronger positive correlation between evaluation results and RMs' Best-of-N alignment performance compared to RewardBench across multiple external benchmarks.",
    378       "evidence": "Figure 1 and Figure 4 show Spearman rank correlations between RMB/RewardBench rankings and BoN performance on MixEval, MixEval-Hard, Arena-Hard, and AdvBench. RMB shows positive correlations while RewardBench exhibits poor or negative correlations in several settings (Section 5.2).",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Generative models (GPT-4o, Claude-3.5-sonnet) show great promise in reward modeling, surpassing state-of-the-art discriminative RMs.",
    383       "evidence": "Table 3 shows GPT-4o (0.738 overall) and Qwen2-72B (0.723) outperform the best discriminative RM Starling-RM-34B (0.712), while Claude-3.5-sonnet (0.706) is slightly below it overall. GPT-4o tops the overall ranking, and Claude-3.5-sonnet leads on helpfulness.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "It is hard for an RM to be both competitive in judging helpfulness and harmlessness, with a -0.57 ranking correlation between the two objectives.",
    388       "evidence": "Table 6 shows the top 10 RMs' separate rankings on helpfulness and harmlessness. Spearman correlation of -0.57 is computed. Eurus-RM-7b ranks 2nd on helpfulness but 9th on harmlessness; GPT-4o ranks 7th on helpfulness but 1st on harmlessness (Section 4.2).",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "BoN evaluation provides higher difficulty and greater differentiation than pairwise evaluation, with an average 17.5% performance decline.",
    393       "evidence": "Section 4.2 reports 17.5% average decline from pairwise to BoN. Table 8 shows consistently higher standard deviation for BoN (0.121 vs 0.076 helpfulness, 0.132 vs 0.069 harmlessness). Table 7 breaks down by RM type. Figure 20 visualizes the distribution.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Majority voting may not be effective for handling noise in reward model evaluation.",
    398       "evidence": "Section 6 and Figure 24 show confidence-weighted metrics do not improve correlation between evaluation results and downstream alignment performance. Despite Table 9 showing agreement increases with confidence, the weighted evaluation doesn't enhance benchmark validity.",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "Using more complex helpfulness evaluation criteria tends to reduce generative RM performance.",
    403       "evidence": "Table 12 shows verbose criteria hurt most models: Llama3.1-70B (-0.009), Mistral-Large (-0.049), Llama3.1-8B (-0.045), Llama2-70B (-0.081), Qwen2-72B (-0.039). Only Mixtral-8x7B shows marginal improvement (+0.003).",
    404       "supported": "moderate"
    405     },
    406     {
    407       "claim": "Top RMs show consistent performance across helpfulness scenarios but struggle with diverse harmlessness scenarios.",
    408       "evidence": "Figure 27 shows helpfulness task correlations mostly >0.85. Figure 28 shows harmlessness correlations with much more variation (e.g., Specialized Advice vs Sexual Content correlation 0.28). Figure 3 confirms visually. Section 4.3 discusses.",
    409       "supported": "strong"
    410     }
    411   ],
    412   "methodology_tags": ["benchmark-eval"],
    413   "key_findings": "RMB is a comprehensive reward model benchmark covering 49 real-world scenarios with both pairwise and Best-of-N evaluation paradigms. The benchmark shows stronger correlation with downstream alignment performance than RewardBench, with generative models (GPT-4o, Claude-3.5-sonnet) outperforming discriminative RMs. A trade-off exists between helpfulness and harmlessness evaluation capability (rank correlation -0.57), and BoN evaluation proves more challenging and discriminative than pairwise testing. Majority voting for confidence weighting did not improve benchmark validity.",
    414   "red_flags": [
    415     {
    416       "flag": "No error bars on main results",
    417       "detail": "Table 3 and all evaluation tables report single point estimates without confidence intervals, error bars, or multi-run variance. For generative RMs using stochastic decoding, results could vary across runs, but no uncertainty quantification is provided."
    418     },
    419     {
    420       "flag": "Single AI judge for all scoring",
    421       "detail": "All preference pairs are scored by GPT-4-turbo-2024-04-09 as the sole judge. While human validation shows ~75% agreement, systematic biases in GPT-4's preferences (e.g., favoring verbose responses, certain response styles) could propagate through the entire benchmark. The ~25% disagreement rate means a substantial fraction of labels may not reflect true human preference."
    422     },
    423     {
    424       "flag": "Contamination risk unaddressed",
    425       "detail": "WildChat is a publicly available corpus. Reward models trained after its release may have seen similar prompts or response patterns in their training data. No contamination analysis is performed despite this being a benchmark evaluating model capabilities."
    426     },
    427     {
    428       "flag": "Correlation validation uses limited model set",
    429       "detail": "The downstream alignment correlation (Section 5) is computed over a small set of reward models with only a handful of external benchmarks and sampling configurations. Small-N correlations can be unstable, and no bootstrap confidence intervals are provided for the Spearman coefficients."
    430     }
    431   ],
    432   "cited_papers": [
    433     {
    434       "title": "RewardBench: Evaluating reward models for language modeling",
    435       "authors": ["Nathan Lambert", "Valentina Pyatkin", "Jacob Morrison"],
    436       "year": 2024,
    437       "arxiv_id": "2403.13787",
    438       "relevance": "The primary baseline — the only prior reward model benchmark. RMB is positioned as a more comprehensive and alignment-correlated alternative."
    439     },
    440     {
    441       "title": "Training language models to follow instructions with human feedback",
    442       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    443       "year": 2022,
    444       "relevance": "Foundational RLHF paper defining the alignment process and the role of reward models that RMB evaluates."
    445     },
    446     {
    447       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    448       "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"],
    449       "year": 2022,
    450       "arxiv_id": "2204.05862",
    451       "relevance": "Defines the helpfulness-harmlessness alignment framework that RMB's two evaluation dimensions are based on."
    452     },
    453     {
    454       "title": "Constitutional AI: Harmlessness from AI feedback",
    455       "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"],
    456       "year": 2022,
    457       "arxiv_id": "2212.08073",
    458       "relevance": "Introduces AI feedback for alignment; RMB builds on this approach by using generative models as reward models."
    459     },
    460     {
    461       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    462       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    463       "year": 2024,
    464       "relevance": "LLM-as-a-Judge paradigm used for generative RM evaluation in RMB and for benchmark correlation analysis."
    465     },
    466     {
    467       "title": "UltraFeedback: Boosting language models with scaled AI feedback",
    468       "authors": ["Ganqu Cui", "Lifan Yuan", "Ning Ding"],
    469       "year": 2024,
    470       "relevance": "Major preference dataset compared against RMB in Table 1; uses similar AI feedback methodology for data construction."
    471     },
    472     {
    473       "title": "Generative verifiers: Reward modeling as next-token prediction",
    474       "authors": ["Lunjun Zhang", "Arian Hosseini", "Hritik Bansal"],
    475       "year": 2024,
    476       "arxiv_id": "2408.15240",
    477       "relevance": "Argues for generative approaches to reward modeling, a key finding confirmed by RMB's evaluation results."
    478     },
    479     {
    480       "title": "Secrets of RLHF in large language models part II: Reward modeling",
    481       "authors": ["Binghai Wang", "Rui Zheng", "Lu Chen"],
    482       "year": 2024,
    483       "arxiv_id": "2401.06080",
    484       "relevance": "Detailed analysis of reward model training dynamics and challenges, directly relevant to understanding RM quality assessment."
    485     },
    486     {
    487       "title": "Self-rewarding language models",
    488       "authors": ["Weizhe Yuan", "Richard Yuanzhe Pang", "Kyunghyun Cho"],
    489       "year": 2024,
    490       "arxiv_id": "2401.10020",
    491       "relevance": "Proposes using LLMs as their own reward models for alignment, relevant to the generative RM evaluation paradigm in RMB."
    492     },
    493     {
    494       "title": "Llama Guard: LLM-based input-output safeguard for human-AI conversations",
    495       "authors": ["Hakan Inan", "Kartikeya Upasani", "Jianfeng Chi"],
    496       "year": 2023,
    497       "arxiv_id": "2312.06674",
    498       "relevance": "Provides the taxonomy of safety categories that RMB's harmlessness scenarios are based on."
    499     },
    500     {
    501       "title": "Helping or herding? Reward model ensembles mitigate but do not eliminate reward hacking",
    502       "authors": ["Jacob Eisenstein", "Chirag Nagpal", "Alekh Agarwal"],
    503       "year": 2023,
    504       "arxiv_id": "2312.09244",
    505       "relevance": "Studies reward hacking in alignment, directly relevant to RMB's findings about helpfulness-harmlessness trade-offs and reward model limitations."
    506     },
    507     {
    508       "title": "WildChat: 1M ChatGPT interaction logs in the wild",
    509       "authors": ["Wenting Zhao", "Xiang Ren", "Jack Hessel"],
    510       "year": 2024,
    511       "arxiv_id": "2405.01470",
    512       "relevance": "Primary source of real-world prompts for RMB's benchmark construction."
    513     }
    514   ],
    515   "engagement_factors": {
    516     "practical_relevance": {
    517       "score": 2,
    518       "justification": "RM developers and alignment researchers can directly use the benchmark to evaluate reward models, with code and data released."
    519     },
    520     "surprise_contrarian": {
    521       "score": 1,
    522       "justification": "Challenges RewardBench's validity and reveals a helpfulness-harmlessness trade-off, but these findings are not dramatically surprising to the alignment community."
    523     },
    524     "fear_safety": {
    525       "score": 1,
    526       "justification": "Reveals reward model weaknesses in harmlessness evaluation, with implications for alignment safety, but does not demonstrate novel attacks."
    527     },
    528     "drama_conflict": {
    529       "score": 1,
    530       "justification": "Implicitly critiques RewardBench as showing poor correlation with actual alignment performance, but framed diplomatically as complementary work."
    531     },
    532     "demo_ability": {
    533       "score": 2,
    534       "justification": "Code and datasets released on GitHub; researchers can evaluate their own reward models against the benchmark."
    535     },
    536     "brand_recognition": {
    537       "score": 1,
    538       "justification": "From Fudan University; evaluates well-known models (GPT-4o, Claude-3.5-sonnet) but the benchmark itself and authors are not widely recognized."
    539     }
    540   }
    541 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs