scan.json (30143B)
1 { 2 "paper": { 3 "title": "Relative Preference Optimization: Enhancing LLM Alignment through Contrasting Responses across Identical and Diverse Prompts", 4 "authors": [ 5 "Yueqin Yin", 6 "Zhendong Wang", 7 "Yi Gu", 8 "Hai Huang", 9 "Weizhu Chen", 10 "Mingyuan Zhou" 11 ], 12 "year": 2024, 13 "venue": "arXiv.org", 14 "arxiv_id": "2402.10958", 15 "doi": "10.48550/arXiv.2402.10958" 16 }, 17 "scan_version": 3, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "methodology_tags": ["benchmark-eval"], 20 "key_findings": "RPO extends DPO by constructing a contrast matrix that compares preferred/dispreferred responses across semantically related prompts, not just identical ones, using embedding-based similarity reweighting. RPO is evaluated on Anthropic-HH, OpenAI Summarization, and AlpacaEval2.0. RPO-Paired achieves a 78.52% GPT-4 win rate on Anthropic-HH with Mistral-7B vs DPO's 72.26%, and 38.88% on AlpacaEval2.0 vs DPO's 30.84%. Ablations show that naive uniform or diagonal weighting underperforms DPO, confirming that semantic similarity reweighting is the critical component. RPO also works with unpaired preference data, extending applicability beyond paired settings.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The abstract provides a GitHub link: https://github.com/yinyueqin/relative-preference-optimization. Appendix G also includes the core Python implementation of the RPO loss function." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The paper uses publicly available datasets: Anthropic's Helpful and Harmless dataset (170k dialogues) and OpenAI's Summarization dataset. AlpacaEval2.0 is also a public benchmark. No proprietary data was collected." 
32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "Table 5 lists some hyperparameters and mentions '8 Nvidia A100 GPUs' and RMSProp optimizer, but no requirements.txt, Dockerfile, or library versions are provided. Not enough detail to recreate the environment." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions are included in the paper. A GitHub link is provided and hyperparameters are listed in Table 5, but the paper lacks a 'Reproducing Results' section with specific commands to run." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "All results in Tables 1-4 are point estimates (e.g., '78.52' win rate). No confidence intervals, error bars, or ± notation appear anywhere in the paper." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper claims RPO 'significantly outperforms' baselines (abstract, Section 4.3) but provides no statistical significance tests — no p-values, t-tests, or bootstrap tests. Comparisons are made by simply comparing two numbers." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Tables 1-4 report both RPO and baseline win rates, providing context for the magnitude of improvement. For example, RPO-Paired achieves 78.52 vs DPO's 72.26 on Anthropic-HH with Mistral-7B (Table 4), allowing readers to assess the 6.26 percentage point improvement." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "Table 5 states '256' comparisons for computing win rate, and the ablation studies note '256 samples' from the test set. No justification is given for why 256 is sufficient, and no power analysis is discussed." 
64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "No standard deviations, variance, or any spread measures are reported across experimental runs. All results appear to be single-run point estimates." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "Table 4 compares RPO against SFT, PPO, IPO, DPO, and KTO — a comprehensive set of alignment baselines." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "Baselines include DPO (NeurIPS 2023), IPO (AISTATS 2024), and KTO (2024), all very recent and state-of-the-art methods for preference alignment." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "Section 4.2 presents extensive ablation studies: Table 1 ablates weighting strategies (uniform, diagonal, embedding), Table 2 ablates embedding extraction models and temperature, Table 3 ablates batch size. Appendix F adds further ablations on prompt-only vs. prompt-response weighting, beta values, and sampling temperature." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": false, 90 "justification": "The sole evaluation metric is GPT-4 win rate, applied across different datasets (HH, Summarization, AlpacaEval2.0). While evaluated on multiple tasks, there is only one type of metric — no complementary metrics such as perplexity, BLEU, ROUGE, or human ratings are used." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": false, 95 "justification": "No human evaluation is conducted. The paper uses GPT-4 as the sole evaluator, describing it as 'a stand-in for human evaluators' (Section 4.1)." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "Section 4.1 states evaluations were conducted on 'the validation sets of Anthropic's HH Dataset for dialogue and the OpenAI Summarization Dataset for summarization.' 
AlpacaEval2.0 uses a separate set of 805 prompts." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Table 4 breaks down results by model (LLaMA2-7B, LLaMA2-13B, Mistral-7B) and by task (Anthropic-HH, OpenAI Summarization, AlpacaEval2.0). Ablation tables provide further breakdowns." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": false, 110 "justification": "No qualitative failure analysis or error analysis is provided. The generation examples in Appendix I show only successful outputs without discussing cases where RPO fails or produces poor responses." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Table 1 reports that Uniform Weighting (68.36) and Diagonal Weighting (69.92) both underperform DPO (72.26). Table 3 shows small batch sizes (2) underperform DPO. Table 6 shows integrated prompt-response weighting reduces generalizability on AlpacaEval." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract claims 'superior ability to align LLMs' on dialogue, summarization, and AlpacaEval2.0. Table 4 supports this — RPO-Paired achieves the highest win rates in most settings. The claim about handling 'both paired and unpaired sets' is demonstrated in Table 4 with separate RPO-Unpaired and RPO-Paired results." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The paper's causal claims (e.g., 'reweighting based on prompt similarities significantly enriches model alignment') are supported by controlled ablation studies. Table 1 systematically varies the weighting strategy while holding other factors constant. Table 2 ablates embedding models, and Table 3 ablates batch size — each is a controlled single-variable manipulation." 
128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": false, 132 "justification": "The title claims 'Enhancing LLM Alignment' broadly, but results are limited to 3 models (LLaMA2-7B, LLaMA2-13B, Mistral-7B) on 2 datasets plus AlpacaEval. The abstract's claim of 'superior ability to align LLMs with user preferences' is not bounded to the tested models or tasks." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper does not consider alternative explanations for RPO's improvements. For example, it does not discuss whether the gains could be due to implicit data augmentation (seeing more contrastive pairs per batch), regularization effects of the reweighting, or other confounds beyond the proposed semantic similarity mechanism." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper uses GPT-4 win rate as a proxy for human preference alignment. While it acknowledges GPT-4 as 'a stand-in for human evaluators' (Section 4.1), it does not discuss the gap between GPT-4 judgments and actual human preferences, known biases of LLM-as-judge (e.g., verbosity bias, position bias), or limitations of this proxy." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "The paper specifies 'LLaMA2-7B', 'LLaMA2-13B', and 'Mistral-7B' as base models. Table 5 specifies the GPT-4 judge version as 'gpt-4-0613' and AlpacaEval judge as 'alpaca_eval_gpt4_turbo_fn'. Embedding model specified as 'all-MiniLM-L6-v2'." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "Appendix H provides the full GPT-4 evaluation prompts for both dialogue and summarization tasks. The training data comes from standard datasets, so training prompts are implicitly available." 
155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Table 5 in Appendix D provides comprehensive hyperparameters: batch size 64, learning rate 5e-7, 1 epoch, β=0.1, τ values, sampling temperature 0, optimizer (RMSProp), max prompt length 256, max total length 512." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used. RPO is a training-time preference optimization method, not an agentic system." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 4.1 describes the data processing: in paired settings, each batch has N triplets (x, yw, yl); in unpaired settings, triplets are deconstructed into (x, yw) and (x, yl) pairs, shuffled, and N instances of each are sampled. The SFT phase using preferred responses is also described." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 5 includes a 'Limitations & Future Work' subsection identifying three specific limitations: dependency on embedding model quality, contrast matrix limited by single-GPU mini-batch memory, and the constant Z(x) assumption." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "The three limitations are specific to RPO: (1) a weak text encoder may fail to capture prompt similarities, (2) contrastive matrix size is bounded by GPU memory within a single mini-batch, and (3) the algorithm assumes constant Z(x) across all prompts. These are not generic disclaimers." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": false, 186 "justification": "The paper does not explicitly state what the results do NOT show. It does not acknowledge that results are limited to 7B-13B models, English-only text, or the specific task domains tested. 
No explicit boundary statements like 'our results apply only to...' are present." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "The underlying training datasets (Anthropic-HH, OpenAI Summarization) are public, but the experimental outputs — model generations, GPT-4 evaluation judgments, and per-sample win/loss decisions — are not released. Only aggregated win rates are reported." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section 4.1 describes the datasets in detail: Anthropic-HH has '170k dialogues, each comprising a human query and paired model responses rated for helpfulness and harmlessness.' The OpenAI Summarization dataset structure is also described." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants were recruited. All data comes from standard public benchmarks (Anthropic-HH, OpenAI Summarization, AlpacaEval2.0)." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "Section 4.1 documents the pipeline: preferred responses used for SFT, then preference pairs processed in paired or unpaired settings. The unpaired construction is explicitly described (deconstruct triplets, shuffle, extract N instances)." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding acknowledgments or grant information appears in the paper. Authors are affiliated with UT Austin, Microsoft Azure AI, and Google, but no funding sources are disclosed." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are clearly listed on the first page: The University of Texas at Austin, Microsoft Azure AI, and Google." 
221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": false, 225 "justification": "Authors are affiliated with Microsoft Azure AI and Google, both of which have commercial interests in LLM alignment technology. No funding source is disclosed, making independence unverifiable. The institutional affiliations suggest potential non-independence." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial interests statement appears in the paper. Authors from Microsoft and Google may have relevant financial interests but these are not declared." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "No training data cutoff dates are stated for LLaMA2 or Mistral-7B. These models' pretraining data could include the evaluation datasets (Anthropic-HH published 2022, OpenAI Summarization published 2020), but this is not discussed." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": false, 242 "justification": "No discussion of potential overlap between the base models' pretraining data and the evaluation datasets. Since LLaMA2 was released in 2023 and the datasets predate it, contamination is possible." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": false, 247 "justification": "Both the Anthropic-HH dataset (2022) and OpenAI Summarization dataset (2020) were publicly available before the training cutoffs of LLaMA2 (2023) and Mistral (2023). This contamination risk is not addressed." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study. The paper evaluates training methods using automated benchmarks and GPT-4 as judge." 
255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in this study." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in this study." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No inference cost or latency is reported. The additional computational overhead of computing prompt embeddings and the contrast matrix is not quantified." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "The paper mentions '8 Nvidia A100 GPUs' (Table 5) but does not state total GPU hours, training wall-clock time, or total compute budget for the experiments." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "No mention of running experiments with multiple random seeds. All results appear to be from single runs." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The number of experimental runs is never explicitly stated. Results are presented as single values without indicating whether they are from one run or averaged over multiple." 
309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "The ablation studies explore several hyperparameter values (τ, embedding models, batch sizes, β values, sampling temperatures), but the total computational budget for this search is not reported." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": true, 318 "justification": "The configuration selection is justified through systematic ablation. Table 2 shows all-MiniLM-L6-v2 at τ=0.5 was selected based on performance on Anthropic-HH. Table 1 justifies embedding reweighting over alternatives. Section 4.2 explains the reasoning." 319 }, 320 "multiple_comparison_correction": { 321 "applies": true, 322 "answer": false, 323 "justification": "Multiple comparisons are made across methods, models, datasets, and hyperparameter settings without any correction for multiple comparisons (Bonferroni, Holm, etc.)." 324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The authors implement all baselines themselves and compare RPO against these implementations. No discussion of potential author-evaluation bias or reference to independent baseline implementations." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "RPO introduces additional compute overhead from embedding computation and contrast matrix construction compared to DPO, but performance is not reported as a function of compute budget. No matched-compute comparison is provided." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "The paper uses GPT-4 win rate as the sole evaluation metric without discussing whether it actually measures alignment with human preferences. It cites Zheng et al. (2023) and Li et al. (2023) to justify GPT-4 as evaluator but does not discuss construct validity limitations." 
339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No scaffolding is involved. RPO is a training-time method and all comparisons use the same inference setup." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "The evaluation datasets (Anthropic-HH 2022, OpenAI Summarization 2020) were publicly available before the base models (LLaMA2, Mistral) were trained in 2023. This temporal leakage risk is not discussed." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of whether evaluation setup or features could leak answer information. The GPT-4 evaluation prompts in Appendix H show both responses simultaneously, but position bias is not addressed." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of potential overlap or structural similarity between training and test splits of the datasets used." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No leakage detection or prevention method is applied — no canary strings, membership inference, decontamination, or temporal analysis." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "RPO with embedding reweighting significantly outperforms DPO on dialogue tasks", 372 "evidence": "Table 1 shows RPO Embedding Reweighting (Paired, τ=0.5) achieves 78.52% win rate vs DPO's 72.26% on Anthropic-HH with Mistral-7B. 
Table 4 shows consistent improvements across LLaMA2-7B (68.75 vs 63.67) and LLaMA2-13B (72.66 vs 63.28).", 373 "supported": "moderate" 374 }, 375 { 376 "claim": "Semantically related prompts serve as effective contrastive pairs only with proper reweighting", 377 "evidence": "Table 1 shows Uniform Weighting (68.36) and Diagonal Weighting (69.92) both underperform DPO (72.26), while Embedding Reweighting surpasses it (78.52). This ablation demonstrates that naive cross-prompt contrast is harmful but similarity-weighted contrast helps.", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "RPO works with both paired and unpaired preference data", 382 "evidence": "Table 4 shows RPO-Unpaired achieves competitive results: 75.00 on Anthropic-HH (vs DPO's 72.26), 50.39 on Summarization (vs DPO's 48.83), and 31.24 on AlpacaEval2.0 (vs DPO's 30.84). In some settings RPO-Unpaired matches or exceeds RPO-Paired.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "Larger batch sizes improve RPO performance", 387 "evidence": "Table 3 shows monotonic improvement from batch size 2 (71.48%) to 8 (78.52%) per GPU on Anthropic-HH with Mistral-7B.", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "RPO outperforms all state-of-the-art alignment methods across tasks", 392 "evidence": "Table 4 shows RPO-Paired achieves the highest win rates in 4 of 5 settings. However, on OpenAI Summarization, RPO-Unpaired (50.39) and RPO-Paired (50.00) are close to the baseline, and all results lack error bars or significance tests.", 393 "supported": "moderate" 394 } 395 ], 396 "red_flags": [ 397 { 398 "flag": "GPT-4 as sole evaluator", 399 "detail": "All results rely exclusively on GPT-4 win rate with no human evaluation. Known biases of LLM-as-judge (verbosity preference, position bias) are not discussed. The paper's core claim about 'alignment with human preferences' is evaluated without any human involvement." 
400 }, 401 { 402 "flag": "No error bars or variance reporting", 403 "detail": "All results across Tables 1-4 and Appendix F are single-run point estimates with no standard deviations, confidence intervals, or multiple-seed results. The word 'significantly' is used in claims without any statistical significance testing." 404 }, 405 { 406 "flag": "Small evaluation sample size", 407 "detail": "Win rates are computed on only 256 samples (Table 5), which is relatively small. With no variance reporting, it is unclear whether the observed differences (e.g., 78.52 vs 72.26) are statistically meaningful or within sampling noise." 408 }, 409 { 410 "flag": "Potential conflicts of interest undisclosed", 411 "detail": "Authors are affiliated with Microsoft Azure AI and Google, both companies with significant commercial interests in LLM alignment technology. No funding, competing interests, or financial interest statements appear in the paper." 412 }, 413 { 414 "flag": "Claims of 'significant' improvement without significance tests", 415 "detail": "The abstract and Section 4.3 use language like 'significantly outperforms' and 'superior ability' without any statistical tests. The observed improvements could be within noise of single-run experiments." 416 } 417 ], 418 "cited_papers": [ 419 { 420 "title": "Direct preference optimization: Your language model is secretly a reward model", 421 "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Stefano Ermon", "Christopher D Manning", "Chelsea Finn"], 422 "year": 2023, 423 "relevance": "The foundational method that RPO extends; core baseline for LLM alignment without reward models." 424 }, 425 { 426 "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback", 427 "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"], 428 "year": 2022, 429 "arxiv_id": "2204.05862", 430 "relevance": "Source of the Anthropic-HH dataset used for evaluation; foundational work on RLHF alignment." 
431 }, 432 { 433 "title": "Training language models to follow instructions with human feedback", 434 "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"], 435 "year": 2022, 436 "relevance": "Foundational RLHF work establishing the reward-model-based alignment pipeline that DPO and RPO seek to improve upon." 437 }, 438 { 439 "title": "A general theoretical paradigm to understand learning from human preferences", 440 "authors": ["Mohammad Gheshlaghi Azar", "Mark Rowland", "Bilal Piot"], 441 "year": 2024, 442 "relevance": "Proposes IPO (Identity Preference Optimization), a key baseline for preference alignment that addresses DPO overfitting." 443 }, 444 { 445 "title": "KTO: Model alignment as prospect theoretic optimization", 446 "authors": ["Kawin Ethayarajh", "Winnie Xu", "Niklas Muennighoff", "Dan Jurafsky", "Douwe Kiela"], 447 "year": 2024, 448 "arxiv_id": "2402.01306", 449 "relevance": "Proposes KTO for unpaired preference optimization using prospect theory; key baseline and concurrent work to RPO." 450 }, 451 { 452 "title": "Proximal policy optimization algorithms", 453 "authors": ["John Schulman", "Filip Wolski", "Prafulla Dhariwal", "Alec Radford", "Oleg Klimov"], 454 "year": 2017, 455 "arxiv_id": "1707.06347", 456 "relevance": "PPO is the standard RL algorithm used in RLHF pipelines; baseline method for LLM alignment." 457 }, 458 { 459 "title": "Llama 2: Open foundation and fine-tuned chat models", 460 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 461 "year": 2023, 462 "arxiv_id": "2307.09288", 463 "relevance": "One of the base LLMs used for evaluation (LLaMA2-7B, LLaMA2-13B)." 464 }, 465 { 466 "title": "Mistral 7b", 467 "authors": ["Albert Q Jiang", "Alexandre Sablayrolles", "Arthur Mensch"], 468 "year": 2023, 469 "arxiv_id": "2310.06825", 470 "relevance": "Base LLM used for evaluation; the model on which RPO shows its strongest results." 
471 }, 472 { 473 "title": "Learning to summarize with human feedback", 474 "authors": ["Nisan Stiennon", "Long Ouyang", "Jeffrey Wu"], 475 "year": 2020, 476 "relevance": "Source of the OpenAI Summarization dataset used for evaluation; foundational RLHF work for summarization." 477 }, 478 { 479 "title": "Smaug: Fixing failure modes of preference optimisation with dpo-positive", 480 "authors": ["Arka Pal", "Deep Karkhanis", "Samuel Dooley", "Manley Roberts", "Siddartha Naidu", "Colin White"], 481 "year": 2024, 482 "arxiv_id": "2402.13228", 483 "relevance": "Addresses failure modes in DPO-based preference optimization, relevant to understanding DPO limitations that RPO also targets." 484 }, 485 { 486 "title": "Judging llm-as-a-judge with mt-bench and chatbot arena", 487 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 488 "year": 2023, 489 "arxiv_id": "2306.05685", 490 "relevance": "Foundational work on using LLMs as evaluators, the methodology RPO relies on for all its evaluation results." 491 } 492 ], 493 "engagement_factors": { 494 "practical_relevance": { 495 "score": 2, 496 "justification": "RPO is a drop-in replacement for DPO that practitioners training LLMs could implement; code is provided on GitHub." 497 }, 498 "surprise_contrarian": { 499 "score": 1, 500 "justification": "The idea of cross-prompt contrastive pairs is novel but not deeply contrarian — it extends DPO in an intuitive direction." 501 }, 502 "fear_safety": { 503 "score": 0, 504 "justification": "No safety or security concerns raised; this is a preference alignment improvement method." 505 }, 506 "drama_conflict": { 507 "score": 0, 508 "justification": "No controversy or drama; straightforward methodological contribution." 509 }, 510 "demo_ability": { 511 "score": 1, 512 "justification": "Code is on GitHub but requires substantial compute (8 A100s) to train; not a pip-installable tool or demo." 
513 }, 514 "brand_recognition": { 515 "score": 1, 516 "justification": "Authors from UT Austin, Microsoft, and Google — recognized institutions but not flagship AI lab releases." 517 } 518 } 519 }