scan.json (24437B)
1 { 2 "paper": { 3 "title": "AlphaPO: Reward Shape Matters for LLM Alignment", 4 "authors": [ 5 "Aman Gupta", 6 "Shao Tang", 7 "Qingquan Song", 8 "Sirou Zhu", 9 "Jiwoo Hong", 10 "Ankan Saha", 11 "Viral Gupta", 12 "Noah Lee", 13 "Eunki Kim", 14 "Siyu Zhu", 15 "Parag Agrawal", 16 "Natesh Pillai", 17 "S. Sathiya Keerthi" 18 ], 19 "year": 2025, 20 "venue": "ICML 2025", 21 "arxiv_id": "2501.03884" 22 }, 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "No repository URL or code release link is provided in the paper. The paper references SimPO's GitHub for hyperparameters but does not release AlphaPO's own code." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The paper uses publicly available datasets: UltraFeedback (Cui et al., 2024), AlpacaEval 2.0, and Arena-Hard. The models used are publicly available on HuggingFace (Table 4 lists HuggingFace IDs)." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper mentions '8×A100 GPUs with the adamw_torch optimizer based on the alignment-handbook' (Appendix A.6) but does not provide a requirements.txt, Dockerfile, or detailed library versions." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. While hyperparameters are listed in Table 3 and Appendix A.6, there are no runnable commands or structured reproduction guide." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": true, 50 "justification": "Table 5 in Appendix A.7 reports 95% confidence intervals for Arena-Hard results (e.g., '95 CI high' and '95 CI low' columns). AlpacaEval 2 results include standard deviation (STD) columns." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper claims AlphaPO outperforms SimPO and DPO across multiple benchmarks but does not use any statistical significance tests (no p-values, t-tests, or bootstrap tests). Comparisons are based solely on point estimates." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "The paper reports relative improvements with baseline context, e.g., 'AlphaPO leads to about 7% to 10% relative improvement in alignment performance for the instruct versions of Mistral-7B and Llama3-8B while achieving 15% to 50% relative improvement over DPO' (Abstract). Table 1 provides absolute numbers enabling effect size computation." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "No justification is provided for why the specific benchmarks (805 prompts in AE2, Arena-Hard) are sufficient for the claims being made. No power analysis or sample size discussion." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": true, 70 "justification": "Table 5 reports standard deviation (STD) for AlpacaEval 2 win rates across instructions (e.g., STD 1.40% for AlphaPO on Mistral). Arena-Hard reports 95% confidence intervals." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "The paper compares AlphaPO against DPO and SimPO across all experimental settings (Table 1)." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "SimPO (Meng et al., 2024) and DPO (Rafailov et al., 2023) are the state-of-the-art DAA methods. The paper also compares with SPPO (Wu et al., 2024). These are contemporary and competitive baselines." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "The paper includes ablation studies on the effect of alpha (Figure 3 middle, Figure 8) and gamma (Figure 3 left, Figure 9) on AE2 performance. Section 4.2 systematically varies these parameters." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "The paper evaluates on AlpacaEval 2.0 (length-controlled win rate and raw win rate), Arena-Hard (win rate), HellaSwag, and TruthfulQA (Table 2). Response length is also tracked." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "No human evaluation is included. All evaluation is automated: AlpacaEval 2.0 uses GPT-4 as judge, Arena-Hard uses automated comparison, and HellaSwag/TruthfulQA are automated benchmarks. The paper discusses 'human preferences' and 'human evaluations' conceptually but does not conduct any." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "The evaluation benchmarks (AlpacaEval 2.0 with 805 prompts, Arena-Hard) are separate from the training data (UltraFeedback). HellaSwag and TruthfulQA are also independent test sets." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Table 1 provides results broken down by model (Llama-3, Mistral, Gemma-2), by reward model (PairRM vs ArmoRM), and by benchmark. Table 5 provides further per-model detail. Win rate heatmaps in Appendix A.8 show per-instance comparisons." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper discusses that for Gemma-2, AlphaPO has slightly lower or comparable WR on AE2 and AH compared to SimPO (Section 4.2). The drop-off for large negative alpha values is discussed. The paper notes GPT-4's bias toward longer responses as a confound." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The paper reports that AlphaPO does not improve over SimPO on Gemma-2 Arena-Hard (57.7 vs 59.0, Table 1). It also reports that large negative alpha values hurt performance (Figure 3, Figure 8), and discusses that f-DPO fails to improve with non-zero alpha (Section 3.2)." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract claims '7% to 10% relative improvement' over SimPO and '15% to 50% relative improvement over DPO' for Mistral-7B and Llama3-8B. Table 1 supports these: e.g., Llama-3 LC goes from 42.05 (SimPO) to 45.37 (AlphaPO), about 7.9% relative; Mistral LC goes from 29.71 to 33.03, about 11.2% relative. DPO improvements are also confirmed." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper makes causal claims about the effect of alpha on alignment performance. These are supported by controlled single-variable ablations (varying alpha while fixing other hyperparameters, Section 4.2, Figure 3) and theoretical gradient analysis (Theorems 3.1, 3.2, Corollary 3.3)." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The title 'Reward Shape Matters for LLM Alignment' makes a broad claim. Results are limited to three model families (7B-9B scale), one training dataset (UltraFeedback), and two evaluation benchmarks. The paper does not explicitly bound the generalization to these specific settings, nor discuss whether results hold for larger models or different domains." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper does not discuss alternative explanations for why AlphaPO outperforms SimPO beyond the reward shape analysis. For instance, it does not consider whether the improvement could stem from the additional hyperparameter search (alpha adds a new tuning dimension) rather than the reward shape itself. No threats-to-validity section addresses confounds." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": true, 146 "justification": "Table 4 in Appendix A.6 provides exact HuggingFace model IDs: 'mistralai/Mistral-7B-Instruct-v0.2', 'meta-llama/Meta-Llama-3-8B-Instruct', 'google/gemma-2-9b-it'. These are specific, versioned identifiers." 147 }, 148 "prompts_provided": { 149 "applies": false, 150 "answer": false, 151 "justification": "The paper does not use prompting in the conventional sense. It trains models using preference optimization and evaluates on standard benchmarks. The prompts come from the benchmark datasets (AlpacaEval 2.0, Arena-Hard), not from the authors." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "Table 3 (Appendix A.6) reports alpha, beta, gamma/beta, and learning rate for all models and methods. Additional details include batch size (128), max sequence length (2048), cosine learning rate schedule with warmup ratio 0.1, and decoding temperatures for evaluation." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "No agentic scaffolding is used. This is a standard preference optimization method for LLM training." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 4.1 describes the data preprocessing: 'we regenerate five responses for every prompt in the UF dataset using a sampling temperature of 0.8. We then use two reward models - PairRM and ArmoRM to rank the 5 responses. The highest scoring response is labeled yw and the lowest scoring response is labeled yl.'" 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": false, 173 "justification": "There is no dedicated limitations or threats-to-validity section. The paper has an 'Impact Statement' (Section after Conclusion) but it only states 'There are many potential societal consequences of our work, none which we feel must be specifically highlighted here.' This is not a limitations discussion." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "No specific threats to validity are discussed anywhere in the paper. There is no consideration of confounds, evaluation bias, or limitations of the experimental setup." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its claims to the tested model sizes, the UltraFeedback dataset, or the specific evaluation benchmarks used." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "The raw experimental outputs (model generations, per-prompt scores) are not made available. Only aggregate results in tables and figures are shown." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 4.1 describes how training data was created: regenerating responses from the UltraFeedback dataset using sampling temperature 0.8, then ranking with PairRM or ArmoRM reward models." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants are involved. Data comes from standard public benchmarks and automated response generation." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "The pipeline from UltraFeedback prompts through response regeneration, reward model ranking, to preferred/dispreferred pair selection is clearly documented in Section 4.1. The evaluation pipeline uses standard AlpacaEval 2.0 and Arena-Hard tooling." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding sources are disclosed. The paper lists LinkedIn Corporation as an affiliation but does not have an acknowledgments section disclosing funding." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are clearly listed: LinkedIn Corporation (most authors) and KAIST AI. The footnote notes some authors' work was 'done while at LinkedIn Corporation.'" 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "LinkedIn Corporation (a Microsoft subsidiary) has a direct commercial interest in LLM alignment methods. The majority of authors are LinkedIn employees. This creates a potential non-independent funding relationship, though the paper evaluates open-source models rather than LinkedIn's proprietary models." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial interests statement is present in the paper." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "The paper does not state the training data cutoff dates for Mistral-7B-Instruct-v0.2, Llama-3-8B-Instruct, or Gemma-2-9B-it. Since these models are being further fine-tuned, the pre-training cutoff is relevant to understanding potential benchmark contamination." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": false, 239 "justification": "No discussion of whether AlpacaEval 2.0 or Arena-Hard prompts/answers could have appeared in the pre-training data of the base models." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": false, 244 "justification": "AlpacaEval was published in 2023 and HellaSwag in 2019, before the training cutoffs of the models used. No contamination analysis is provided. Arena-Hard is relatively newer but still no contamination discussion." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants are involved in this study." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants are involved in this study." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants are involved in this study." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants are involved in this study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants are involved in this study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants are involved in this study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants are involved in this study." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "No inference cost or latency is reported for the trained models. The paper does not mention API costs or tokens consumed during evaluation." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": true, 293 "justification": "Appendix A.6 states: 'All the training experiments in this paper were conducted on 8×A100 GPUs... The training time for Mistral-Instruct and Llama-3-Instruct is around 2.3 hours, while Gemma-2-Instruct requires 3 hours.'" 294 } 295 } 296 }, 297 "claims": [ 298 { 299 "claim": "AlphaPO achieves 7% to 10% relative improvement in length-controlled win rate over SimPO on AlpacaEval 2.0 for Llama3-8B and Mistral-7B instruct models.", 300 "evidence": "Table 1 shows Llama-3: 45.37 vs 42.05 LC (7.9% relative), Mistral: 33.03 vs 29.71 LC (11.2% relative) with PairRM data.", 301 "supported": "strong" 302 }, 303 { 304 "claim": "AlphaPO achieves 15% to 50% relative improvement over DPO on the same models.", 305 "evidence": "Table 1 shows Llama-3: 45.37 vs 39.18 LC (15.8% relative), Mistral: 33.03 vs 21.96 LC (50.4% relative) with PairRM data.", 306 "supported": "strong" 307 }, 308 { 309 "claim": "The reward function shape significantly impacts preference-based alignment learning, with the alpha parameter controlling the intensity of likelihood displacement.", 310 "evidence": "Theoretical analysis in Theorems 3.1 and 3.2, supported by training dynamics experiments in Figures 2 and 7 showing non-monotonic behavior of likelihood displacement with varying alpha.", 311 "supported": "strong" 312 }, 313 { 314 "claim": "A slightly positive value of alpha achieves the best alignment performance.", 315 "evidence": "Figure 3 (middle) and Figure 8 show peak AE2 performance at alpha=0.25 for Mistral and Llama-3, and alpha=0.1 for Gemma-2.", 316 "supported": "strong" 317 }, 318 { 319 "claim": "AlphaPO combined with SPPO achieves 47.42% length-controlled win rate on AlpacaEval 2.0.", 320 "evidence": "Section 4.2 reports AlphaPO+SPPO achieves 47.42% LC, improved from 45.37% for AlphaPO alone, using PairRM-based UltraFeedback data.", 321 "supported": "moderate" 322 }, 323 { 324 "claim": "AlphaPO outperforms SimPO on HellaSwag and TruthfulQA benchmarks.", 325 "evidence": "Table 2 shows marginal improvements: HellaSwag 0.7694 vs 0.7576 (Llama-3) and 0.8638 vs 0.8610 (Mistral); TruthfulQA 0.6142 vs 0.6078 (Llama-3) and 0.7127 vs 0.7061 (Mistral). Improvements are small and no statistical tests are applied.", 326 "supported": "weak" 327 } 328 ], 329 "methodology_tags": [ 330 "benchmark-eval", 331 "theoretical" 332 ], 333 "key_findings": "AlphaPO introduces a parameterized reward function shape (via an alpha parameter applied to alpha-divergence with length normalization) for direct alignment algorithms. Theoretical gradient analysis and empirical experiments demonstrate that the reward shape controls likelihood displacement intensity in a non-monotonic fashion. AlphaPO outperforms SimPO by 7-10% and DPO by 15-50% on AlpacaEval 2.0 length-controlled win rate for Llama-3-8B and Mistral-7B, though gains are smaller or absent for Gemma-2-9B. The method adds one hyperparameter (alpha) to SimPO's existing framework.", 334 "red_flags": [ 335 { 336 "flag": "No significance testing", 337 "detail": "All comparisons between AlphaPO, SimPO, and DPO are based on point estimates without any statistical significance tests. The improvements on HellaSwag and TruthfulQA (Table 2) are small enough that they could be within noise." 338 }, 339 { 340 "flag": "No limitations section", 341 "detail": "The paper lacks a dedicated limitations section. The 'Impact Statement' is boilerplate with no substantive discussion. No threats to validity, scope boundaries, or alternative explanations are provided." 342 }, 343 { 344 "flag": "Additional hyperparameter confound", 345 "detail": "AlphaPO introduces alpha as an additional tunable hyperparameter. The paper does not discuss whether the improvement could partly stem from having an additional dimension of hyperparameter search rather than the reward shape itself. SimPO has 3 hyperparameters; AlphaPO has 4." 346 }, 347 { 348 "flag": "No contamination analysis", 349 "detail": "Evaluation benchmarks (AlpacaEval, HellaSwag, TruthfulQA) predate the models' training. No analysis of whether benchmark data leaked into pre-training corpora." 350 }, 351 { 352 "flag": "Single-run results for main benchmarks", 353 "detail": "The main results in Table 1 appear to be from single best runs per hyperparameter configuration. While Table 5 reports STD across instructions, there is no report of variance across training seeds or runs." 354 } 355 ], 356 "cited_papers": [ 357 { 358 "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", 359 "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Christopher D. Manning", "Stefano Ermon", "Chelsea Finn"], 360 "year": 2023, 361 "relevance": "Foundational DAA method that AlphaPO builds upon and compares against." 362 }, 363 { 364 "title": "SimPO: Simple Preference Optimization with a Reference-Free Reward", 365 "authors": ["Yu Meng", "Mengzhou Xia", "Danqi Chen"], 366 "year": 2024, 367 "relevance": "Primary baseline; AlphaPO generalizes SimPO's log reward function with an alpha-parameterized family." 368 }, 369 { 370 "title": "Beyond Reverse KL: Generalizing Direct Preference Optimization with Diverse Divergence Constraints", 371 "authors": ["Chaoqi Wang", "Yibo Jiang", "Chenghao Yang", "Han Liu", "Yuxin Chen"], 372 "year": 2024, 373 "relevance": "f-DPO method that inspired AlphaPO's alpha-divergence reward, but without length normalization." 374 }, 375 { 376 "title": "Unintentional Unalignment: Likelihood Displacement in Direct Preference Optimization", 377 "authors": ["Noam Razin", "Sadhika Malladi", "Adithya Bhaskar", "Danqi Chen", "Sanjeev Arora", "Boris Hanin"], 378 "year": 2024, 379 "arxiv_id": "2410.08847", 380 "relevance": "Key prior work on likelihood displacement phenomenon that AlphaPO addresses." 381 }, 382 { 383 "title": "Self-Play Preference Optimization for Language Model Alignment", 384 "authors": ["Yue Wu", "Zhiqing Sun", "Huizhuo Yuan", "Kaixuan Ji", "Yiming Yang", "Quanquan Gu"], 385 "year": 2024, 386 "arxiv_id": "2405.00675", 387 "relevance": "SPPO method shown to be complementary to AlphaPO for further alignment improvements." 388 }, 389 { 390 "title": "ORPO: Monolithic Preference Optimization without Reference Model", 391 "authors": ["Jiwoo Hong", "Noah Lee", "James Thorne"], 392 "year": 2024, 393 "relevance": "Introduced length normalization for reward in preference optimization, a key ingredient in AlphaPO." 394 }, 395 { 396 "title": "Scaling Laws for Reward Model Overoptimization in Direct Alignment Algorithms", 397 "authors": ["Rafael Rafailov", "Yaswanth Chittepu", "Ryan Park", "Harshit Sikchi", "Joey Hejna", "Bradley Knox", "Chelsea Finn", "Scott Niekum"], 398 "year": 2024, 399 "relevance": "Studies reward over-optimization in DAAs, a phenomenon AlphaPO aims to mitigate." 400 }, 401 { 402 "title": "Understanding Likelihood Over-Optimisation in Direct Alignment Algorithms", 403 "authors": ["Zhengyan Shi", "Simon Land", "Acyr Locatelli", "Matthieu Geist", "Max Bartolo"], 404 "year": 2024, 405 "relevance": "Studies over-optimization in DAAs showing that controlled likelihood reduction benefits generalization." 406 }, 407 { 408 "title": "Tulu 3: Pushing Frontiers in Open Language Model Post-Training", 409 "authors": ["Nathan Lambert"], 410 "year": 2024, 411 "arxiv_id": "2411.15124", 412 "relevance": "Major open-source post-training effort that adopted SimPO's length normalization, demonstrating its practical impact." 413 }, 414 { 415 "title": "f-PO: Generalizing Preference Optimization with f-Divergence Minimization", 416 "authors": ["Jiayi Han", "Mingjian Jiang", "Yansong Song", "Jure Leskovec", "Stefano Ermon", "Minkai Xu"], 417 "year": 2024, 418 "arxiv_id": "2410.21662", 419 "relevance": "Related f-divergence approach to preference optimization addressing over-optimization." 420 }, 421 { 422 "title": "UltraFeedback: Boosting Language Models with Scaled AI Feedback", 423 "authors": ["Ganqu Cui", "Lifan Yuan", "Ning Ding"], 424 "year": 2024, 425 "relevance": "Primary training dataset used in all AlphaPO experiments." 426 } 427 ] 428 }