scan.json (30498B)
1 { 2 "paper": { 3 "title": "Improving LLM General Preference Alignment via Optimistic Online Mirror Descent", 4 "authors": [ 5 "Yuheng Zhang", 6 "Dian Yu", 7 "Tao Ge", 8 "Linfeng Song", 9 "Zhichen Zeng", 10 "Haitao Mi", 11 "Nan Jiang", 12 "Dong Yu" 13 ], 14 "year": 2025, 15 "venue": "arXiv", 16 "arxiv_id": "2502.16852", 17 "doi": "10.48550/arXiv.2502.16852" 18 }, 19 "scan_version": 3, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval", "theoretical"], 22 "key_findings": "ONPO integrates optimistic online mirror descent into self-play RLHF, achieving an O(1/T) duality gap bound, improving upon the previous O(1/√T) result. On AlpacaEval 2.0, ONPO achieves 21.2% and 9.9% relative improvement over the strongest baseline (INPO) with Mistral-Instruct and Llama-3-8B respectively. Academic benchmarks show ONPO preserves base model capabilities without significant alignment tax. The method avoids needing to estimate win rates over the full policy, relying only on binary preference signals.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "No repository URL or code archive is provided in the paper. No GitHub link, footnote, or supplementary code release is mentioned." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The paper uses publicly available datasets and models: the RLHFlow prompt collection (huggingface.co/datasets/RLHFlow/prompt-collection-v0.1), RLHFlow/LLaMA3-SFT, Mistral-7B-Instruct-v0.3, and the pair-preference-model. All evaluation benchmarks (AlpacaEval 2.0, Arena-Hard, MT-Bench) are public." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "No requirements.txt, Dockerfile, or environment setup details are provided. Appendix B lists training hyperparameters but not software versions, GPU types, or library dependencies." 
39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions are included. Algorithm 1 describes the method but there are no runnable scripts or README instructions for reproducing experiments." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "Tables 1 and 2 report only point estimates with no confidence intervals or error bars. Figure 1 shows performance across η values but without uncertainty bands." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "No statistical significance tests are performed. Claims that ONPO 'outperforms' baselines are based solely on comparing point estimates without any t-test, bootstrap, or other significance test." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "The paper reports relative improvements with baseline context: '21.2% improvement on Mistral-It' (35.3→42.8 on AlpacaEval) and '9.9% improvement on Llama-3-SFT' (44.2→48.6). Absolute baseline and result values are provided in Table 1." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "No justification for sample sizes is given. The evaluation benchmarks are standard (805 AlpacaEval instructions, 500 Arena-Hard queries, 80 MT-Bench questions), but there is no power analysis or discussion of whether these sizes are sufficient for the claimed comparisons." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "No variance, standard deviation, or spread across runs is reported anywhere. All results appear to be from single runs with no multi-seed evaluation." 
71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Three online baselines are compared: Iterative DPO (Dong et al., 2024), SPPO (Wu et al., 2024), and INPO (Zhang et al., 2024). Additional reference models (Llama-3-70B-it, GPT-4, Claude-3-Opus, etc.) are shown in Table 1." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "All three main baselines (Iterative DPO, SPPO, INPO) are from 2024, representing current state-of-the-art online general preference alignment methods." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "The comparison between ONPO and INPO (by the same lead author) effectively ablates the key innovation: INPO uses standard OMD while ONPO adds the optimistic predictor. Section 6.3 provides a hyperparameter sensitivity analysis varying η across 5 values." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Three main benchmarks are used: AlpacaEval 2.0 (LC win rate), Arena-Hard (win rate), and MT-Bench (1-10 GPT-4 rating). Table 2 adds six academic benchmarks (GPQA, MMLU-Pro, Hellaswag, Winogrande, TruthfulQA, GSM8K)." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "No human evaluation is included. All evaluation uses GPT-4 Preview-1106 as the automated judge. The paper's core claim is about alignment with human preferences, making human evaluation relevant." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "Evaluation is on standard held-out benchmarks (AlpacaEval 2.0, Arena-Hard, MT-Bench) that are separate from the training prompt collection (RLHFlow/prompt-collection-v0.1). No tuning is done on the test sets." 
103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Table 2 provides per-benchmark breakdowns across six academic tasks (GPQA, Hellaswag, MMLU-Pro, Winogrande, TruthfulQA, GSM8K) plus the average. Table 1 breaks down results by benchmark and base model." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": false, 112 "justification": "No failure cases, error analysis, or qualitative examples of where ONPO produces poor outputs are discussed. The paper only shows aggregate performance numbers." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": false, 117 "justification": "Table 1 underlines cases where baselines outperform ONPO (e.g., INPO gets 37.0 vs ONPO's 36.4 on Arena-Hard with Llama-3-SFT), but these are not discussed or analyzed. The text says ONPO 'consistently outperforms or achieves comparable performance' without investigating the losses." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract claims ONPO 'outperforms state-of-the-art RLHF algorithms across multiple representative benchmarks.' Table 1 shows ONPO wins on 5/6 benchmark-model combinations and is close on the 6th. The O(T⁻¹) theoretical bound is proved in Theorem 4.2." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "The key causal claim — that the optimistic predictor enables faster convergence — is justified by formal theoretical analysis (Theorem 4.2) proving the O(T⁻¹) bound, plus the controlled comparison to INPO which isolates the optimistic component. The theoretical proof provides adequate causal mechanism." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The title claims 'LLM General Preference Alignment' but experiments use only two 7-8B parameter models. 
No testing on larger models, different model families, or non-English settings. The results are specific to single-turn alignment with a particular preference oracle." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "No alternative explanations are discussed. The paper doesn't consider whether implementation details, hyperparameter tuning advantages, or the specific preference model choice could explain the results, beyond the theoretical motivation." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper measures GPT-4 judge win rates and frames this as 'alignment with human preferences' without discussing the gap. The mention that AlpacaEval LC has 0.98 Spearman correlation with Chatbot Arena partially bridges this but the limitations of automated judges as proxies for human preferences are not discussed." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": true, 151 "justification": "Specific model versions with HuggingFace URLs are provided: RLHFlow/LLaMA3-SFT (based on Llama-3-8B), Mistral-7B-Instruct-v0.3, RLHFlow/pair-preference-model-LLaMA3-8B, and GPT-4 Preview-1106 as the judge model." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": false, 156 "justification": "No actual prompt text is provided. The training prompt collection is referenced by URL but the prompts themselves are not shown. Judge prompts for AlpacaEval/Arena-Hard/MT-Bench are inherited from those benchmarks but not reproduced." 
157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Appendix B reports: cosine learning rate scheduler, peak learning rate 5×10⁻⁷, warm-up ratio 0.03, global batch size 128, 1/η grid search over [0.1, 0.05, 0.02, 0.01, 0.005] with 1/η=0.01 selected, K=8 responses per prompt, 5 iterations for Llama-3-SFT and 3 for Mistral-Instruct." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. This is a training/alignment algorithm paper, not an agentic system." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": false, 171 "justification": "The online training loop is described in Algorithm 1 (sample K=8 responses, tournament selection for yw/yl), but the prompt collection preprocessing is not documented. The paper defers training details to Dong et al. (2024) rather than documenting the full pipeline." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": false, 178 "justification": "No dedicated limitations section. Section 7 ('Conclusion and Future Work') mentions multi-turn implementation and active data selection as future work, but does not discuss limitations of the current work. The Impact Statement is generic boilerplate." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": false, 183 "justification": "No specific threats to validity are discussed. No mention of threats from the automated judge, limited model sizes, single preference oracle, or single-run evaluations." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": false, 188 "justification": "No explicit scope boundaries. Section 5.2 notes the multi-turn setting is left for future work, but the paper does not explicitly state what the results do NOT show or which settings are excluded from the claims." 
189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": false, 195 "justification": "No raw experimental data (model outputs, preference judgments, per-example scores) is released. Only aggregate benchmark scores are reported in tables." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "The data generation process is described: at each iteration, K=8 responses are sampled from the current policy using prompts from the RLHFlow prompt collection, then the preference oracle (pair-preference-model) provides binary feedback via tournament selection (Section 6.1)." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants. Data sources are standard public benchmarks and a public prompt collection." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": false, 210 "justification": "Algorithm 1 documents the high-level pipeline, but key details are deferred to Dong et al. (2024), including training details for the preference model, prompt collection construction, and the full experimental workflow." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding information is disclosed. There is no acknowledgments section mentioning grants or sponsors despite authors being from UIUC and Tencent AI Lab." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are clearly listed: University of Illinois Urbana-Champaign and Tencent AI Lab, Bellevue." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "Tencent AI Lab, a corporate research lab, employs most of the authors and has commercial interest in RLHF improvements. No funding independence discussion is provided." 
228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial interests statement is included in the paper." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": false, 239 "justification": "The training data cutoff dates for Llama-3-8B and Mistral-7B-v0.3 base models are not stated. Without this, it is impossible to assess whether evaluation benchmarks could be in the pre-training data." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": false, 244 "justification": "No discussion of whether AlpacaEval, Arena-Hard, MT-Bench, or academic benchmarks (MMLU-Pro, GPQA, etc.) overlap with the base models' pre-training data." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": false, 249 "justification": "Several benchmarks (Hellaswag 2019, Winogrande 2021, GSM8K 2021, TruthfulQA 2021) were published well before Llama-3 and Mistral-v0.3 training. Potential contamination is not addressed." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study. All evaluation is automated." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 
282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No inference cost, latency, or per-example cost is reported. The method requires K=8 response generations plus preference model queries per training iteration, but the cost of this pipeline is not quantified." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "No GPU hours, hardware specifications, or total training time are reported. The paper describes hyperparameters (Appendix B) but not the computational resources required." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "No multi-seed results are reported. All tables show single point estimates without any indication of seed sensitivity." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": false, 310 "justification": "The number of experimental runs is never stated. Results appear to be from single runs." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": true, 315 "justification": "Appendix B states a grid search for 1/η over [0.1, 0.05, 0.02, 0.01, 0.005] (5 values). Figure 1 shows performance across different η values." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": false, 320 "justification": "Appendix B states '1/η = 0.01' was selected but does not specify whether this was chosen on validation or test data, or what selection criterion was used." 
321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "Multiple comparisons are made across 9 benchmarks and 2 base models, but no statistical tests are performed and no correction for multiple comparisons is applied." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "No discussion of author-evaluation bias. It is unclear whether the authors re-implemented baselines or used original code. The lead author is also the lead author of INPO (Zhang et al., 2024), one of the main baselines." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "No comparison of compute budgets across methods. ONPO uses a two-step update per iteration (vs one step for INPO) but the additional compute cost is not quantified or discussed." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "No discussion of whether AlpacaEval, Arena-Hard, or MT-Bench actually measure alignment quality. The paper cites the 0.98 Spearman correlation with Chatbot Arena for AlpacaEval but does not question whether these benchmarks capture general preference alignment." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No agentic scaffolding is involved. This is a training algorithm comparison where the same online RLHF workflow is used across methods." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of temporal leakage. Several academic benchmarks (Hellaswag 2019, GSM8K 2021) predate the base models' training and solutions may be in training data." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether the evaluation setup leaks information. 
The tournament-style preference selection during training could introduce biases that the paper does not address." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "No discussion of independence between training prompts and evaluation benchmarks, or whether AlpacaEval/Arena-Hard prompts overlap with the RLHFlow prompt collection." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No leakage detection or prevention methods are applied. No canary strings, decontamination, or membership inference analysis." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "ONPO achieves an O(1/T) duality gap bound, improving upon the previous O(1/√T) result of INPO.", 374 "evidence": "Theorem 4.2 provides the formal proof with DualGap(π̄) ≤ 4√D/T. The proof in Appendix A.2 leverages the RVU property of optimistic OMD where stability terms cancel out.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "ONPO achieves 21.2% relative improvement over the strongest baseline on AlpacaEval 2.0 with Mistral-Instruct.", 379 "evidence": "Table 1: ONPO achieves 42.8 LC win rate vs INPO's 35.3 on AlpacaEval 2.0 with Mistral-Instruct. (42.8-35.3)/35.3 ≈ 21.2%.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "ONPO achieves 9.9% relative improvement over the strongest baseline on AlpacaEval 2.0 with Llama-3-SFT.", 384 "evidence": "Table 1: ONPO achieves 48.6 LC win rate vs INPO's 44.2. (48.6-44.2)/44.2 ≈ 9.95% from the rounded table values, which the paper reports as 9.9%. However, no error bars are reported, so the significance of this difference is unclear.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "ONPO consistently outperforms or achieves comparable performance to baselines across both base models.", 389 "evidence": "Table 1 shows ONPO wins on 5 of 6 benchmark-model combinations. 
On Arena-Hard with Llama-3-SFT, INPO scores 37.0 vs ONPO's 36.4 — a small loss that is underlined but not discussed.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "ONPO does not over-align the model and preserves intrinsic knowledge and abilities.", 394 "evidence": "Table 2 shows ONPO achieves 55.4 average across 6 academic benchmarks with Mistral-Instruct, slightly above the base model (54.6) and all baselines (54.3-55.1). The differences are very small.", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "ONPO is robust to hyperparameter η variations.", 399 "evidence": "Figure 1 shows ONPO outperforms the best baseline across all tested η values on both Arena-Hard and AlpacaEval 2.0. However, only 5 values of 1/η are tested and no error bars are shown.", 400 "supported": "moderate" 401 } 402 ], 403 "red_flags": [ 404 { 405 "flag": "No error bars or variance across runs", 406 "detail": "All results are single point estimates without any uncertainty quantification. For claims of 'outperforming' baselines where margins are sometimes small (e.g., 36.4 vs 37.0 on Arena-Hard), statistical significance is unknown." 407 }, 408 { 409 "flag": "No human evaluation for alignment claims", 410 "detail": "The paper claims to improve 'alignment with human preferences' but all evaluation uses GPT-4 as an automated judge. No human evaluation is conducted to validate that the improvements transfer to actual human preference." 411 }, 412 { 413 "flag": "Self-comparison with lead author's prior work", 414 "detail": "The lead author (Yuheng Zhang) is also the lead author of INPO (Zhang et al., 2024), the main baseline being compared against. The implementation details of both methods may not be independently verified." 415 }, 416 { 417 "flag": "No limitations section", 418 "detail": "The paper has no dedicated discussion of limitations, threats to validity, or scope boundaries. 
The Impact Statement is generic boilerplate: 'none of which we feel must be specifically highlighted here.'" 419 }, 420 { 421 "flag": "Overclaiming in title scope", 422 "detail": "The title claims 'LLM General Preference Alignment' but experiments are limited to two 7-8B models, single-turn settings, one preference oracle, and English-only benchmarks. The generality claimed far exceeds what was tested." 423 } 424 ], 425 "cited_papers": [ 426 { 427 "title": "Direct preference optimization: Your language model is secretly a reward model", 428 "authors": ["R. Rafailov", "A. Sharma", "E. Mitchell", "C. D. Manning", "S. Ermon", "C. Finn"], 429 "year": 2024, 430 "relevance": "Foundational offline preference alignment algorithm (DPO) that ONPO builds upon and compares against." 431 }, 432 { 433 "title": "Self-play preference optimization for language model alignment", 434 "authors": ["Y. Wu", "Z. Sun", "H. Yuan", "K. Ji", "Y. Yang", "Q. Gu"], 435 "year": 2024, 436 "arxiv_id": "2405.00675", 437 "relevance": "Self-play alignment approach (SPPO) that is a direct baseline in the experiments." 438 }, 439 { 440 "title": "Iterative nash policy optimization: Aligning llms with general preferences via no-regret learning", 441 "authors": ["Y. Zhang", "D. Yu", "B. Peng", "L. Song", "Y. Tian", "M. Huo", "N. Jiang", "H. Mi", "D. Yu"], 442 "year": 2024, 443 "arxiv_id": "2407.00617", 444 "relevance": "Direct predecessor to ONPO (INPO); uses standard OMD for self-play alignment. Main baseline and ablation reference." 445 }, 446 { 447 "title": "A general theoretical paradigm to understand learning from human preferences", 448 "authors": ["M. G. Azar", "Z. D. Guo", "B. Piot", "R. Munos", "M. Rowland", "M. Valko", "D. Calandriello"], 449 "year": 2024, 450 "relevance": "First work on general preference alignment (IPO) without BT model assumption, foundational to this line of research." 451 }, 452 { 453 "title": "Nash learning from human feedback", 454 "authors": ["R. Munos", "M. Valko", "D. 
Calandriello", "M. G. Azar", "M. Rowland", "Z. D. Guo", "Y. Tang", "M. Geist", "T. Mesnard", "A. Michi"], 455 "year": 2023, 456 "arxiv_id": "2312.00886", 457 "relevance": "First to formulate alignment as a two-player zero-sum game (Nash-MD), the framework ONPO operates within." 458 }, 459 { 460 "title": "Direct nash optimization: Teaching language models to self-improve with general preferences", 461 "authors": ["C. Rosset", "C.-A. Cheng", "A. Mitra", "M. Santacroce", "A. Awadallah", "T. Xie"], 462 "year": 2024, 463 "arxiv_id": "2404.03715", 464 "relevance": "Alternative general preference alignment method (DNO) compared in the discussion section." 465 }, 466 { 467 "title": "RLHF workflow: From reward modeling to online RLHF", 468 "authors": ["H. Dong", "W. Xiong", "B. Pang", "H. Wang", "H. Zhao", "Y. Zhou", "N. Jiang", "D. Sahoo", "C. Xiong", "T. Zhang"], 469 "year": 2024, 470 "arxiv_id": "2405.07863", 471 "relevance": "Provides the online RLHF workflow implementation that ONPO follows, including iterative DPO baseline." 472 }, 473 { 474 "title": "Human alignment of large language models through online preference optimisation", 475 "authors": ["D. Calandriello", "D. Guo", "R. Munos", "M. Rowland", "Y. Tang", "B. Pires", "P. H. Richemond", "C. L. Lan", "M. Valko", "T. Liu"], 476 "year": 2024, 477 "arxiv_id": "2403.08635", 478 "relevance": "Online IPO algorithm for general preference alignment that ONPO is compared against in Section 5." 479 }, 480 { 481 "title": "Training language models to follow instructions with human feedback", 482 "authors": ["L. Ouyang", "J. Wu", "X. Jiang", "D. Almeida", "C. Wainwright", "P. Mishkin"], 483 "year": 2022, 484 "relevance": "Foundational RLHF work (InstructGPT) establishing the reward model + PPO pipeline that subsequent alignment work builds upon." 485 }, 486 { 487 "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback", 488 "authors": ["Y. Bai", "A. Jones", "K. Ndousse", "A. 
Askell"], 489 "year": 2022, 490 "arxiv_id": "2204.05862", 491 "relevance": "RLHF alignment methodology for Claude, demonstrating the practical importance of alignment research." 492 }, 493 { 494 "title": "Proximal policy optimization algorithms", 495 "authors": ["J. Schulman", "F. Wolski", "P. Dhariwal", "A. Radford", "O. Klimov"], 496 "year": 2017, 497 "arxiv_id": "1707.06347", 498 "relevance": "PPO algorithm widely used in RLHF pipelines that ONPO aims to improve upon in terms of stability and compute." 499 }, 500 { 501 "title": "KTO: Model alignment as prospect theoretic optimization", 502 "authors": ["K. Ethayarajh", "W. Xu", "N. Muennighoff", "D. Jurafsky", "D. Kiela"], 503 "year": 2024, 504 "arxiv_id": "2402.01306", 505 "relevance": "Alternative alignment algorithm using prospect theory, representing a different approach to preference learning." 506 }, 507 { 508 "title": "Multi-turn reinforcement learning from preference human feedback", 509 "authors": ["L. Shani", "A. Rosenberg", "A. Cassel", "O. Lang", "D. Calandriello", "A. Zipori"], 510 "year": 2024, 511 "arxiv_id": "2405.14655", 512 "relevance": "Multi-turn RLHF formulation as contextual MDP that ONPO discusses extending to in Section 5.2." 513 } 514 ], 515 "engagement_factors": { 516 "practical_relevance": { 517 "score": 2, 518 "justification": "RLHF practitioners could implement ONPO following Algorithm 1 and the described hyperparameters, but no code is released to make adoption immediate." 519 }, 520 "surprise_contrarian": { 521 "score": 1, 522 "justification": "Challenges the Bradley-Terry assumption (a known limitation) and improves the convergence rate from O(1/√T) to O(1/T), but the general direction is incremental within established game-theoretic alignment." 523 }, 524 "fear_safety": { 525 "score": 0, 526 "justification": "No safety or risk concerns raised; this is an alignment improvement paper that aims to make models better aligned." 
527 }, 528 "drama_conflict": { 529 "score": 0, 530 "justification": "No controversy or conflict; straightforward algorithmic improvement with standard benchmark comparisons." 531 }, 532 "demo_ability": { 533 "score": 0, 534 "justification": "No code, demo, or pip-installable tool released; implementation would require significant effort." 535 }, 536 "brand_recognition": { 537 "score": 1, 538 "justification": "Tencent AI Lab is a known corporate lab and UIUC is a respected university, but neither carries the social media attention of OpenAI/Anthropic/Google." 539 } 540 } 541 }