scan.json (31062B)
1 { 2 "paper": { 3 "title": "LLM Alignment as Retriever Optimization: An Information Retrieval Perspective", 4 "authors": [ 5 "Bowen Jin", 6 "Jinsung Yoon", 7 "Zhen Qin", 8 "Ziqi Wang", 9 "Wei Xiong", 10 "Yu Meng", 11 "Jiawei Han", 12 "Sercan Ö. Arık" 13 ], 14 "year": 2025, 15 "venue": "International Conference on Machine Learning", 16 "arxiv_id": "2502.03699", 17 "doi": "10.48550/arXiv.2502.03699" 18 }, 19 "scan_version": 3, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval", "theoretical"], 22 "key_findings": "The paper establishes a systematic framework mapping LLM alignment to information retrieval, drawing parallels between LLMs/reward models and retrievers/rerankers. LARPO, the proposed alignment method based on IR ranking objectives (contrastive, LambdaRank, ListMLE), achieves 38.9% and 13.7% averaged relative improvement over baselines on AlpacaEval2 and MixEval-Hard. Empirical analyses show that listwise objectives outperform pairwise (DPO), harder negatives improve alignment, and larger candidate lists with memorization of prior iterations enhance performance.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper, abstract, or footnotes." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The paper uses publicly available datasets: UltraFeedback (Cui et al., 2024) for training, AlpacaEval2 (Dubois et al., 2024), MixEval (Ni et al., 2024), GSM8K (Cobbe et al., 2021), and MATH for evaluation. Baseline checkpoints are from Meng et al. (2024b). All are publicly accessible." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment setup section is provided. 
The paper mentions model names but not software library versions or hardware environment specifications." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. Algorithm 1 describes the method at a high level, and Appendix H provides some hyperparameter details, but there are no concrete commands or workflow to replicate results." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "All results in Tables 2, 3, 4, and 6 are reported as point estimates only. No confidence intervals, error bars, or ± notation appears anywhere in the results." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper claims LARPO 'outperforms' baselines and reports 'improvements' based solely on comparing raw numbers. No statistical significance tests (p-values, t-tests, bootstrap tests) are reported for any comparison." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "The paper reports relative improvements with baseline context: '38.9% and 13.7% averaged relative improvements on AlpacaEval2 and MixEval-Hard' (Section 5). Tables show both baseline and LARPO scores, allowing readers to compute absolute differences." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "No justification for the number of training prompts, evaluation examples, or generated responses per prompt (10) beyond what was inherited from the benchmark designs. No power analysis or sample size reasoning." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "No standard deviations, variance across seeds, or interquartile ranges are reported. All results appear to be single-run numbers with no spread measures." 
71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Extensive baseline comparisons are provided in Table 2, including RRHF, SLiC-HF, DPO, IPO, CPO, KTO, RDPO, SimPO, and Iterative DPO. Section 5 details baseline selection." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "Baselines include SimPO (2024), RDPO (2024), KTO (2024), CPO (2024), and Iterative DPO (2024), which represent recent and competitive methods in direct preference optimization." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Section 6 provides detailed ablations: optimization objectives (Table 3, Section 6.1), hard negatives (Figure 4a-b, Section 6.2), and candidate list construction (Figure 4c, Table 4, Section 6.3). Each component is studied independently." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Results are reported on AlpacaEval2 (LC Win Rate and Win Rate), MixEval (Score), and MixEval-Hard (Score) — four metrics across two benchmarks." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "All evaluation is automated: AlpacaEval2 uses GPT-4 as judge, MixEval uses automated scoring. No human evaluation of model outputs is included." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "Evaluation uses standard held-out benchmark test sets: AlpacaEval2's 805 questions, MixEval's separate test sets, and GSM8K's test set. Training is on UltraFeedback, which is separate from evaluation benchmarks." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Results are broken down by base model (Mistral-Base vs Mistral-Instruct in Table 2; Gemma2-2b-it vs Mistral-7b-it in Table 3), by benchmark (AlpacaEval2, MixEval, MixEval-Hard), and by objective variant." 
108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Figure 4(b) discusses that 'much lower temperature could lead to less diverse responses and finally lead to LLM alignment performance drop.' Figure 4(c) shows diminishing returns with larger candidate lists. These are explicit discussions of where the approach has limitations." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "Figure 4(b) shows performance degradation at very low temperatures. Table 3 shows contrastive sometimes outperforms listwise (Gemma2 MixEval-Hard). Not all LARPO variants uniformly win across all settings." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract claims '38.9% and 13.7% averaged improvement on AlpacaEval2 and MixEval-Hard.' Table 2 shows LARPO variants consistently outperform baselines. The claim is framed as 'averaged relative improvement' which is supported by the results, though the exact averaging computation is not shown inline." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper makes causal claims through ablation studies: 'harder negatives lead to a more performant LLM' (Section 6.2), 'as the candidate list size increases, performance improves' (Section 6.3). These are supported by controlled single-variable manipulations in Sections 6.1-6.3." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The title claims 'LLM Alignment' generally, but experiments cover only 3 open-source models (Mistral-7b, Gemma2-2b, Mathstral-7b) at ≤7B parameters, tested on 2 conversational benchmarks and 1 math benchmark. No larger models, no proprietary models, no non-English evaluation. 
The abstract says 'a new alignment method that enhances overall alignment quality' without bounding to tested settings." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "No discussion of alternative explanations for why LARPO outperforms baselines. The improvements could partly be due to generating 5x as many responses per prompt (10 vs 2 for iterative DPO), longer training, or reward model quality rather than the IR-inspired objectives. None of these confounds are considered." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper measures AlpacaEval2 win rate (LLM-judge preference) and MixEval scores, and frames these as measuring 'alignment quality.' No discussion of whether LLM-judge preferences actually measure alignment, or what alignment means beyond benchmark performance." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": true, 151 "justification": "Specific model identifiers are provided: 'Gemma2-2b-it' (Team et al., 2024b), 'Mistral-7b-it' (Jiang et al., 2023a), 'Mathstral-7b-it' (Mistral AI, 2025), 'Mistral-7b-base'. These are specific open-source checkpoints identifiable from their papers. The reward models LLM-Blender and FsfairX are also specified with citations." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": false, 156 "justification": "The paper references public datasets (UltraFeedback, AlpacaEval2, MixEval) as prompt sources but does not provide actual prompt text used in training or evaluation. No example prompts are shown." 
157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Appendix H provides detailed hyperparameters: learning rate (5e-7), temperature search range (0.8-1.2), number of responses per prompt (10), number of iterations (3), epochs per iteration (2), and specific configurations for each LARPO variant and baseline." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. LARPO is a training method for preference optimization, not an agent-based system." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "The data construction pipeline is described: Algorithm 1 details how responses are generated, scored by the reward model, ranked, and assembled into preference data. Appendix H specifies how positive/negative examples are selected (e.g., 'top-1 ranked response and bottom-3 ranked responses' for contrastive LARPO)." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": false, 178 "justification": "There is no dedicated limitations section. The Impact Statement (Section 8 area) says 'we do not believe any specific impacts warrant explicit discussion,' which dismisses rather than engages with limitations." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": false, 183 "justification": "No threats to validity are discussed. There is no consideration of specific methodological weaknesses such as single-seed results, small model sizes, limited benchmark coverage, or the compute asymmetry between LARPO and baselines." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": false, 188 "justification": "No explicit statements about what the results do not show. The paper does not bound its claims to the tested model sizes (≤7B), model families (Mistral, Gemma), or evaluation settings." 
189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": false, 195 "justification": "The generated preference data (responses, reward scores, preference pairs) used to train LARPO is not released. While the source datasets are public, the intermediate training data that would allow verification of the pipeline is not available." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Algorithm 1 and Section 4 describe the data generation process: prompts sampled from UltraFeedback, responses generated by the policy LLM at specified temperatures, scored by the reward model, and ranked to form preference data." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants. All data comes from standard public benchmarks and LLM-generated responses." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "Algorithm 1 documents the full pipeline: prompt sampling → response generation with temperature → reward model scoring → ranking → preference data construction → iterative training. Appendix H adds specific details for each experimental configuration." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "The Acknowledgements section lists funding: Apple PhD Fellowship, DARPA INCAS (HR0011-21-C0165), DARPA BRIES (HR0011-24-3-0325), ONR (N000142412612), multiple NSF grants, Cisco, and the Center for Intelligent Information Retrieval." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are clearly listed: University of Illinois at Urbana-Champaign, Google Cloud AI Research, Google DeepMind, University of Virginia." 
223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "Authors from Google Cloud AI Research and Google DeepMind evaluate results on Google's Gemma2 model. Google has a commercial interest in demonstrating that alignment methods work well with their models. The paper does not acknowledge this potential conflict." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial interests statement is provided. Multiple authors are employed by Google, which has commercial interests in LLM alignment methods." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": false, 239 "justification": "No training data cutoff dates are stated for Mistral-7b, Gemma2-2b, or Mathstral-7b. The reader cannot assess whether benchmark data was in the pretraining corpus." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": false, 244 "justification": "No discussion of whether UltraFeedback training prompts overlap with AlpacaEval2 or MixEval evaluation data, or whether the base models' pretraining data includes benchmark content." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": false, 249 "justification": "GSM8K (2021) and AlpacaEval2 questions could be in the training data of models released in 2024. No contamination analysis is performed or discussed." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 
267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No inference cost, latency, or tokens consumed are reported. LARPO generates 10 responses per prompt across 3 iterations, which is substantial compute, but no cost figures are provided." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "No GPU hours, total training time, or hardware specifications are reported. The paper describes multi-iteration training with response generation and reward model scoring but does not quantify the total compute budget." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "No results across multiple random seeds are reported. All results appear to be single-run numbers." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": false, 310 "justification": "The number of experimental runs is never stated. There is no 'averaged over K runs' or similar statement." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "Temperature is searched among {0.8, 0.9, 1.0, 1.1, 1.2} (Appendix H.1) and learning rate among {1e-7, 2e-7, 5e-7, 2e-8, 5e-8} (Appendix H.4), but total compute spent on the search is not reported." 
316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": false, 320 "justification": "Appendix H.4 mentions 'hyperparameter tuning and early stopping to find the best model checkpoints' but does not specify whether selection was performed on a validation set separate from the test benchmarks." 321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "Many comparisons are made across methods, models, and benchmarks without any statistical tests, let alone corrections for multiple comparisons." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "Offline baseline checkpoints are from Meng et al. (2024b), which is good practice. However, the iterative DPO baseline and LARPO are both implemented by the authors, and the 5x difference in generated responses per prompt (2 vs 10) is not acknowledged as a potential source of bias." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "LARPO generates 10 responses per prompt while iterative DPO generates only 2, representing roughly 5x more inference compute for data generation. This compute asymmetry is never discussed or controlled for in comparisons." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "No discussion of whether AlpacaEval2 (LLM-judge preference) or MixEval actually measure 'alignment quality' as claimed. The paper takes benchmark scores at face value without questioning construct validity." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No agentic scaffolding is used. LARPO is a training methodology, not a scaffolded system." 
346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the base models (Mistral-7b, Gemma2-2b) were trained on data that includes benchmark solutions. GSM8K was published in 2021, well before these models were trained." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether UltraFeedback training data overlaps with or leaks information about evaluation benchmarks." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "No discussion of independence between UltraFeedback training data and evaluation benchmark data (AlpacaEval2, MixEval)." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No leakage detection methods (canary strings, n-gram overlap, membership inference, decontamination) are applied." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "LARPO achieves 38.9% and 13.7% averaged relative improvement over baselines on AlpacaEval2 and MixEval-Hard respectively.", 374 "evidence": "Table 2 shows LARPO variants outperforming all baselines across both Mistral-Base and Mistral-Instruct settings. 
E.g., LARPO (LambdaRank) achieves 34.9 LC WR on Mistral-Base vs SimPO's 21.5, and 32.9 on Mistral-Instruct vs SimPO's 32.1.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Contrastive optimization generally outperforms pairwise optimization (DPO) for LLM alignment.", 379 "evidence": "Table 3 shows contrastive > pairwise on AlpacaEval2 for both Gemma2-2b-it (43.41 vs 41.39 LC WR) and Mistral-7b-it (38.44 vs 36.43 LC WR), and on most MixEval metrics.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "Listwise optimization methods (ListMLE, LambdaRank) demonstrate superior performance compared to pairwise and contrastive approaches.", 384 "evidence": "Table 3 shows listwise methods generally outperform others, e.g., ListMLE achieves 49.77 LC WR on Gemma2-2b-it vs 41.39 pairwise and 43.41 contrastive. LambdaRank achieves 40.29 on Mistral-7b-it vs 36.43 pairwise.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "Harder negatives lead to more performant LLMs during alignment training.", 389 "evidence": "Figure 4(a) shows a monotonic improvement from easiest to hardest negatives on GSM8K with Mathstral-7b-it across 3 iterations. Figure 4(b) shows that within a range, lower temperatures (harder negatives) improve AlpacaEval2 win rate.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "Larger candidate list size improves alignment performance with diminishing returns.", 394 "evidence": "Figure 4(c) shows LC win rate and win rate increasing from 4 to 10 candidate responses on Mistral-7b-it with contrastive objective, with the rate of improvement decreasing.", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "Incorporating responses from previous iterations enhances preference optimization performance.", 399 "evidence": "Table 4 shows progressive improvement: current-only (55.06 LC WR) → current+prev (55.62) → current+all prev (56.02) on Gemma2-2b-it. 
Improvements are consistent but small.", 400 "supported": "weak" 401 }, 402 { 403 "claim": "A stronger reward model further improves LARPO by 25.8% on AlpacaEval2.", 404 "evidence": "Table 2 shows LARPO with FsfairX reward model achieving up to 43.0 LC WR on Mistral-Instruct vs 32.9 with LLM-Blender on the same LARPO variant.", 405 "supported": "moderate" 406 } 407 ], 408 "red_flags": [ 409 { 410 "flag": "No error bars or variance across seeds", 411 "detail": "All results in Tables 2-4 and Table 6 are single-run point estimates. Given that RL-based alignment can be sensitive to random seeds (as noted by Henderson et al. 2018 for RL methods), the absence of any uncertainty quantification makes it impossible to assess whether observed differences are reliable or within noise." 412 }, 413 { 414 "flag": "Unfair compute comparison with baselines", 415 "detail": "LARPO generates 10 responses per prompt per iteration while Iterative DPO generates only 2, representing roughly 5x more inference compute for training data generation. The paper does not control for or acknowledge this asymmetry, making it unclear how much improvement comes from the IR-inspired objectives vs simply having more training signal." 416 }, 417 { 418 "flag": "No limitations section", 419 "detail": "The paper has no dedicated limitations discussion. The Impact Statement dismisses the need for one. Key limitations such as evaluation on only small models (≤7B), limited benchmarks, single-seed results, and compute asymmetry are unacknowledged." 420 }, 421 { 422 "flag": "Opaque averaging for headline improvement numbers", 423 "detail": "The abstract claims '38.9% and 13.7% averaged improvement' but does not specify how this average was computed (which baselines, which LARPO variants, which base models were averaged). This makes the headline claim difficult to verify." 
424 }, 425 { 426 "flag": "Google-affiliated authors evaluate on Google's Gemma2 model", 427 "detail": "Authors from Google Cloud AI Research and Google DeepMind include results on Google's Gemma2-2b-it model without acknowledging the potential conflict of interest." 428 } 429 ], 430 "cited_papers": [ 431 { 432 "title": "Direct preference optimization: Your language model is secretly a reward model", 433 "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Christopher D. Manning", "Stefano Ermon", "Chelsea Finn"], 434 "year": 2024, 435 "relevance": "Foundational direct alignment method (DPO) that LARPO builds upon and compares against as a key baseline." 436 }, 437 { 438 "title": "Training language models to follow instructions with human feedback", 439 "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"], 440 "year": 2022, 441 "arxiv_id": "2203.02155", 442 "relevance": "Seminal RLHF paper (InstructGPT) establishing the PPO-based alignment paradigm that direct optimization methods aim to simplify." 443 }, 444 { 445 "title": "SimPO: Simple preference optimization with a reference-free reward", 446 "authors": ["Yu Meng", "Mengzhou Xia", "Danqi Chen"], 447 "year": 2024, 448 "arxiv_id": "2405.14734", 449 "relevance": "Recent direct alignment method used as a primary baseline; also the source of baseline checkpoints used in experiments." 450 }, 451 { 452 "title": "Iterative preference learning from human feedback: Bridging theory and practice for RLHF under KL-constraint", 453 "authors": ["Wei Xiong", "Hanze Dong", "Chenlu Ye"], 454 "year": 2024, 455 "relevance": "Establishes the iterative preference optimization framework that LARPO extends with IR-inspired components." 
456 }, 457 { 458 "title": "RLHF workflow: From reward modeling to online RLHF", 459 "authors": ["Hanze Dong", "Wei Xiong", "Bo Pang"], 460 "year": 2024, 461 "arxiv_id": "2405.07863", 462 "relevance": "Provides the FsfairX reward model and iterative training methodology used in LARPO's experiments." 463 }, 464 { 465 "title": "LiPO: Listwise preference optimization through learning-to-rank", 466 "authors": ["Tianqi Liu", "Zhen Qin", "Junru Wu"], 467 "year": 2024, 468 "arxiv_id": "2402.01878", 469 "relevance": "Most closely related prior work applying learning-to-rank objectives to LLM alignment, though it relies on off-the-shelf listwise preference data." 470 }, 471 { 472 "title": "KTO: Model alignment as prospect theoretic optimization", 473 "authors": ["Kawin Ethayarajh", "Winnie Xu", "Niklas Muennighoff", "Dan Jurafsky", "Douwe Kiela"], 474 "year": 2024, 475 "arxiv_id": "2402.01306", 476 "relevance": "Alternative direct alignment method based on prospect theory, used as a baseline in experiments." 477 }, 478 { 479 "title": "GPT-4 technical report", 480 "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"], 481 "year": 2023, 482 "arxiv_id": "2303.08774", 483 "relevance": "Foundational LLM whose alignment approaches motivate the research direction of this paper." 484 }, 485 { 486 "title": "A general theoretical paradigm to understand learning from human preferences", 487 "authors": ["Mohammad Gheshlaghi Azar", "Zhaohan Daniel Guo", "Bilal Piot", "Rémi Munos"], 488 "year": 2024, 489 "relevance": "IPO method providing theoretical grounding for pairwise preference optimization, used as a baseline." 490 }, 491 { 492 "title": "Contrastive preference optimization: Pushing the boundaries of LLM performance in machine translation", 493 "authors": ["Haoran Xu", "Amr Sharaf", "Yunmo Chen"], 494 "year": 2024, 495 "arxiv_id": "2401.08417", 496 "relevance": "CPO alignment method combining contrastive objectives with SFT, used as a baseline comparison." 
497 }, 498 { 499 "title": "UltraFeedback: Boosting language models with scaled AI feedback", 500 "authors": ["Ganqu Cui", "Lifan Yuan", "Ning Ding"], 501 "year": 2024, 502 "relevance": "Primary training dataset used for all LARPO and baseline preference optimization experiments." 503 }, 504 { 505 "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback", 506 "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"], 507 "year": 2022, 508 "arxiv_id": "2204.05862", 509 "relevance": "Foundational RLHF work from Anthropic establishing helpful/harmless alignment objectives." 510 } 511 ], 512 "engagement_factors": { 513 "practical_relevance": { 514 "score": 2, 515 "justification": "LARPO is a concrete alignment training method that practitioners could implement, but requires significant ML infrastructure and no code is released." 516 }, 517 "surprise_contrarian": { 518 "score": 1, 519 "justification": "The IR-to-alignment mapping is a novel framing, but the underlying techniques (contrastive/listwise ranking) are well-established and the direction is not contrarian." 520 }, 521 "fear_safety": { 522 "score": 0, 523 "justification": "No safety or security concerns raised; the paper focuses on improving alignment quality metrics on standard benchmarks." 524 }, 525 "drama_conflict": { 526 "score": 0, 527 "justification": "No controversy or conflict angle; a standard methods paper proposing incremental improvements." 528 }, 529 "demo_ability": { 530 "score": 0, 531 "justification": "No code, demo, or pretrained model released. Cannot be tried without reimplementation." 532 }, 533 "brand_recognition": { 534 "score": 1, 535 "justification": "Authors from Google Cloud AI Research and Google DeepMind, published at ICML, but the paper itself is not about a major product." 536 } 537 } 538 }