scan-v5.json (25043B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "LLM Alignment as Retriever Optimization: An Information Retrieval Perspective", 6 "authors": [ 7 "Bowen Jin", 8 "Jinsung Yoon", 9 "Zhen Qin", 10 "Ziqi Wang", 11 "Wei Xiong" 12 ], 13 "year": 2025, 14 "venue": "International Conference on Machine Learning", 15 "arxiv_id": "2502.03699", 16 "doi": "10.48550/arXiv.2502.03699" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "The 38.9% and 13.7% averaged relative improvements on AlpacaEval2 and MixEval-Hard are supported by Table 2 results (e.g., LARPO LambdaRank 34.9% vs SimPO 21.5% LC WR on Mistral-Base).", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Causal claims about hard negatives, list size, and optimization objective are each tested in controlled ablation studies (Figure 4a/b/c, Table 3, Table 4) with individual variables manipulated.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper makes broad claims about 'LLM alignment' in the title and conclusions but only tests 7B-class models on two benchmarks (AlpacaEval2, MixEval); no scope limitations on model scale, domain, or safety alignment are stated.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "LARPO uses 10 candidate responses vs. 
2 for DPO baselines, confounding data quantity with loss function design; the paper does not discuss whether improvements stem from more responses rather than the IR-inspired objectives.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper claims to improve 'alignment quality' broadly but measures only instruction-following win rates on AlpacaEval2 (LLM-judged) and MixEval; the gap between these proxies and actual alignment (safety, truthfulness, etc.) is not discussed.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": false, 55 "justification": "There is no limitations section; the Impact Statement explicitly states 'we do not believe any specific impacts warrant explicit discussion,' and no threats-to-validity section exists.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "No threats to validity are discussed anywhere in the paper, including the confound of different candidate list sizes between LARPO and baselines.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper never explicitly states what the results do not show (e.g., no discussion of applicability to larger models, safety alignment, or non-instruction-following tasks).", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Acknowledgments disclose funding from Apple PhD Fellowship, DARPA, ONR, NSF, Cisco, and Center for Intelligent Information Retrieval.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are clearly disclosed: UIUC, Google Cloud AI Research, Google DeepMind, and University of Virginia.", 82 
"source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": true, 87 "justification": "Funders (NSF, DARPA, ONR, Cisco) are independent of the alignment benchmark outcomes; Google-affiliated authors evaluate primarily Mistral models, not Google products.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is present beyond the funding acknowledgments.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "The IR-alignment analogy (LLM as retriever, reward model as reranker) is precisely defined in Section 2, and LARPO's objectives are formally stated in Table 1 with proofs in Appendix F.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Four explicit contributions are listed in the introduction: the IR-alignment framework, significance of three IR principles, the LARPO method, and empirical IR-metric analysis of LLMs.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 7 relates LARPO to DPO, SimPO, LiPO, and iterative DPO, explaining how this work differs from and builds on each; the most related work (LiPO) is specifically distinguished.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": false, 124 "justification": "No code repository or release is mentioned anywhere in the paper.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "All evaluation benchmarks (AlpacaEval2, MixEval, GSM8K, MATH) and training data (Ultrafeedback) 
are standard publicly available datasets.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "No requirements.txt, Dockerfile, or equivalent dependency specification is provided; hardware and software environment are not described.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "Appendices H.1-H.7 provide hyperparameters but no step-by-step reproduction instructions; critical details like GPU type, training time, and data preprocessing code are absent.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "No confidence intervals or error bars are reported for any result across Tables 2-4 or Figures 4-6.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are applied despite comparative claims against multiple baselines.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Relative improvements are quantified (38.9% on AlpacaEval2, 13.7% on MixEval-Hard) with baseline values provided for context.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The choice of benchmark sizes (805 AlpacaEval2 questions, 1000 MixEval-Hard) is not discussed in terms of statistical power or justification.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "No variance, standard deviation, or results across multiple runs are reported for any experimental condition.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 
"justification": "Eight offline baselines (RRHF, SLiC-HF, DPO, IPO, CPO, KTO, RDPO, SimPO) and one online baseline (Iterative DPO) are included in Table 2.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Baselines include SimPO (2024), Iterative DPO (2024), and KTO (2024), which are current state-of-the-art direct preference optimization methods.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "Separate ablations study the optimization objective (Table 3), hard negative hardness (Figure 4a/b), candidate list size (Figure 4c), and memorization (Table 4).", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Multiple metrics are reported: LC Win Rate and raw Win Rate on AlpacaEval2, and Score on both MixEval and MixEval-Hard.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": true, 205 "answer": false, 206 "justification": "Evaluation uses LLM-as-judge (AlpacaEval2 with GPT-4 or Llama-3-70B, MixEval automated scoring); no human evaluation of system outputs is conducted.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "AlpacaEval2 and MixEval serve as held-out test sets; training is on Ultrafeedback, and GSM8K/MATH use standard train/test splits.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": false, 218 "justification": "No per-category or per-task breakdown of results is provided; metrics are reported only as aggregate scores across all benchmark questions.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": false, 224 "justification": "No failure cases or error analysis is presented; the paper only reports aggregate performance improvements.", 225 "source": "haiku" 226 
}, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "Figure 4b shows that very low temperature for negative generation degrades performance, and Appendix H.6 notes that responses become near-identical below a threshold, constituting a reported negative finding.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Specific model identifiers are given: Mistral-7b-base, Mistral-7b-it, Gemma2-2b-it, Mathstral-7b-it, with references to the original model papers.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": false, 244 "justification": "No prompts, system instructions, or prompt templates used during training or evaluation are provided.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Appendices H.1-H.7 report learning rates (e.g., 5e-7), number of iterations (3), number of responses (10), temperature search ranges, and epoch counts for each experiment.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "This paper is about training-time alignment methods, not agentic scaffolding; no agentic scaffolding is used.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": false, 262 "justification": "Data preprocessing for Ultrafeedback training is not documented; how prompts are sampled and filtered from the dataset is not described.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "No model checkpoints, generated response sets, or reward model scores are made available for independent verification.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 
"applies": true, 275 "answer": true, 276 "justification": "Algorithm 1 and Section 4 describe the iterative data collection process (generate k responses, score with reward model, rank and select for training) in sufficient detail.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants are involved; standard benchmark datasets are used.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "Algorithm 1 documents the full pipeline from prompt sampling through response generation, reward scoring, ranking, and model update across iterations.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "The training data cutoff of Mistral-7b or Gemma-2b base models is not stated, which matters since AlpacaEval2 and MixEval questions may appear in pretraining data.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "No discussion of potential overlap between base model pretraining data and the evaluation benchmarks (AlpacaEval2, MixEval, GSM8K, MATH).", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "The possibility that GSM8K, MATH, or AlpacaEval2 questions appeared in the base models' pretraining data is not discussed anywhere.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants involved.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants involved.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": 
false, 327 "answer": false, 328 "justification": "No human participants involved.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants involved.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants involved.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants involved.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants involved.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "No inference latency or cost is reported despite iterative training with 10 generations per prompt being substantially more expensive than DPO.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "No GPU type, GPU hours, or total computational budget is stated for any experiment.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "LARPO achieves 38.9% relative averaged improvement on AlpacaEval2 and 13.7% on MixEval-Hard compared to competitive baselines.", 375 "evidence": "Table 2 shows LARPO (LambdaRank) reaches 34.9% LC WR vs 21.5% for SimPO on Mistral-Base with LLM-Blender reward model; relative improvement calculation over averaged baselines.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Listwise objectives (LambdaRank, ListMLE) outperform pairwise (DPO) and contrastive objectives for LLM alignment.", 380 "evidence": "Table 3 shows LambdaRank (40.29% LC WR) and ListMLE (38.02%) exceed pairwise DPO (36.43%) for Mistral-7b-it; consistent pattern across both models.", 381 
"supported": "moderate" 382 }, 383 { 384 "claim": "Harder negative samples (lower temperature, on-prompt responses) lead to stronger trained LLMs.", 385 "evidence": "Figure 4a shows hardest negatives (temp=0.7, correct prompt) achieve highest accuracy (~0.83) vs. easiest random negatives (~0.75) across 3 iterations on GSM8K.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Larger candidate lists improve alignment performance, with diminishing returns.", 390 "evidence": "Figure 4c shows win rate increases from ~50% (4 responses) to ~62% (10 responses) on AlpacaEval2 with Mistral-7b-it using contrastive objective.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Including responses from all previous iterations in the candidate pool improves alignment over using only the current iteration.", 395 "evidence": "Table 4 shows 'w. current + all prev' achieves 72.50% WR vs. 66.56% for 'w. current only' using Lpair on Gemma2-2b-it.", 396 "supported": "strong" 397 }, 398 { 399 "claim": "The LLM-as-retriever analogy is empirically validated: Pass@N curves for LLMs mirror Recall@N curves for IR retrievers.", 400 "evidence": "Figure 2 shows parallel increasing curves for e5 retriever (Recall@N) and Mathstral-7b-it (Pass@N) on NQ and GSM8K respectively as N increases.", 401 "supported": "moderate" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval", 406 "theoretical" 407 ], 408 "key_findings": "LARPO reframes LLM alignment as iterative retriever optimization by formally mapping LLMs to bi-encoder retrievers and reward models to cross-encoder rerankers, deriving listwise alignment objectives from IR ranking theory. Applied to 7B-class models on AlpacaEval2 and MixEval, LARPO's listwise objectives consistently outperform DPO and SimPO, with the LambdaRank variant achieving the strongest results. Ablations confirm three independent contributors: harder negative samples, larger candidate lists, and listwise over pairwise objectives each provide additive gains. 
The analogy is further empirically validated by showing that Pass@N curves for LLMs mirror Recall@N curves for traditional retrievers.", 409 "red_flags": [ 410 { 411 "flag": "Unfair baseline comparison", 412 "detail": "Offline baseline scores for Table 2 are taken directly from the SimPO paper (Meng et al., 2024b), while LARPO and iterative DPO are evaluated by the authors. LLM-as-judge scores on AlpacaEval2 are not stable across time and evaluator versions, making cross-paper score comparisons unreliable." 413 }, 414 { 415 "flag": "Data quantity confound", 416 "detail": "LARPO generates 10 candidate responses per prompt while DPO baselines use 2. The ablation in Figure 4c shows more responses improve performance, but the main Table 2 comparison does not control for this, making it impossible to isolate the effect of the IR-inspired loss from the effect of more training signal." 417 }, 418 { 419 "flag": "No variance or significance testing", 420 "detail": "No confidence intervals, error bars, or statistical significance tests are reported for any result, despite iterative training introducing substantial run-to-run variance." 421 }, 422 { 423 "flag": "No code or checkpoint release", 424 "detail": "No implementation code, trained model checkpoints, or generated response sets are released, preventing reproduction of the claimed improvements." 425 }, 426 { 427 "flag": "Compute cost unreported", 428 "detail": "LARPO requires iterative retraining with 10 generated responses per prompt versus 2 for DPO baselines; the substantially higher computational cost is never quantified or discussed." 429 }, 430 { 431 "flag": "No limitations section", 432 "detail": "The paper has no dedicated limitations section; the Impact Statement explicitly declines to discuss any specific societal implications." 
433 } 434 ], 435 "cited_papers": [ 436 { 437 "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", 438 "relevance": "Primary baseline and theoretical foundation; LARPO is explicitly framed as an enhancement of DPO's pairwise assumption with listwise IR objectives." 439 }, 440 { 441 "title": "SimPO: Simple Preference Optimization with a Reference-Free Reward", 442 "relevance": "Strongest offline baseline against which LARPO is compared; provides the baseline checkpoint scores used in Table 2." 443 }, 444 { 445 "title": "LiPO: Listwise Preference Optimization through Learning-to-Rank", 446 "relevance": "Most related prior work applying learning-to-rank objectives to LLM alignment; LARPO differentiates itself by online iterative data generation vs. LiPO's off-the-shelf listwise data." 447 }, 448 { 449 "title": "Iterative Preference Learning from Human Feedback: Bridging Theory and Practice for RLHF under KL-Constraint", 450 "relevance": "Online alignment baseline (Iterative DPO) directly compared against LARPO; provides the iterative training framework that LARPO extends." 451 }, 452 { 453 "title": "Optimizing Dense Retrieval Model Training with Hard Negatives", 454 "relevance": "Key IR paper establishing importance of hard negatives for retriever training; motivates LARPO's hard negative strategy for LLM alignment." 455 }, 456 { 457 "title": "RocketQA: An Optimized Training Approach to Dense Passage Retrieval", 458 "relevance": "IR work on candidate list construction and retriever optimization that directly inspires LARPO's inclusiveness and memorization principles." 459 }, 460 { 461 "title": "AlpacaEval: A Simple Way to Debias Automatic Evaluators", 462 "relevance": "Primary evaluation benchmark; length-controlled win rate metric is the main measure of LARPO's performance." 
463 }, 464 { 465 "title": "MixEval: Deriving Wisdom of the Crowd from LLM Benchmark Mixtures", 466 "relevance": "Secondary evaluation benchmark used to validate LARPO performance beyond the AlpacaEval2 results." 467 }, 468 { 469 "title": "UltraFeedback: Boosting Language Models with Scaled AI Feedback", 470 "relevance": "Training dataset used for all LARPO and baseline experiments; central to reproducibility." 471 } 472 ], 473 "engagement_factors": { 474 "practical_relevance": { 475 "score": 2, 476 "justification": "Proposes a concrete training method (LARPO) that practitioners could apply to LLM alignment, but no code release limits immediate adoption." 477 }, 478 "surprise_contrarian": { 479 "score": 2, 480 "justification": "The reframing of LLM alignment as an IR retrieval problem is a genuinely novel perspective that challenges the typical RL/preference-optimization framing." 481 }, 482 "fear_safety": { 483 "score": 1, 484 "justification": "Addresses AI alignment tangentially but focuses on instruction-following quality rather than safety or risk; no safety implications discussed." 485 }, 486 "drama_conflict": { 487 "score": 1, 488 "justification": "Positions itself against DPO's dominance and makes strong performance claims, but the competitive framing is standard for alignment papers." 489 }, 490 "demo_ability": { 491 "score": 0, 492 "justification": "No code, no demo, no interactive system released; results are only reproducible with significant compute and implementation effort." 493 }, 494 "brand_recognition": { 495 "score": 2, 496 "justification": "Authors from Google Cloud AI Research and Google DeepMind, published at ICML 2025, providing significant institutional credibility." 
497 } 498 }, 499 "hn_data": { 500 "threads": [ 501 { 502 "hn_id": "43876843", 503 "title": "Stop treating `AGI' as the north-star goal of AI research", 504 "points": 46, 505 "comments": 32, 506 "url": "https://news.ycombinator.com/item?id=43876843" 507 } 508 ], 509 "top_points": 46, 510 "total_points": 46, 511 "total_comments": 32 512 } 513 }