scan-v5.json (25444B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Getting More Juice Out of the SFT Data: Reward Learning from Human Demonstration Improves SFT for LLM Alignment", 6 "authors": [ 7 "Jiaxiang Li", 8 "Siliang Zeng", 9 "Hoi-To Wai", 10 "Chenliang Li", 11 "Alfredo García", 12 "Mingyi Hong" 13 ], 14 "year": 2024, 15 "venue": "Neural Information Processing Systems", 16 "arxiv_id": "2405.17888", 17 "doi": "10.48550/arXiv.2405.17888" 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "Abstract claims about convergence guarantees, code availability, and performance improvement over SFT are all substantiated by Theorem 3.1, the GitHub link, and Tables 3–4. The claim of robustness to low-quality data is more theoretical than empirically demonstrated.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": true, 30 "justification": "Causal claims ('reward learning improves SFT') are supported by controlled comparisons holding constant the base model and dataset while varying only the training algorithm, which is adequate for this class of claim in ML.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": false, 36 "justification": "The conclusion 'it is beneficial to leverage reward learning throughout the entire alignment process' is stated broadly, but experiments cover only two model families (pythia and zephyr) and two datasets; no explicit scope qualification is given in the conclusion.", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper attributes improvements entirely to the IRL/reward-learning framework without considering that the improvement may stem from the contrastive training objective alone rather than the IRL interpretation; no alternative mechanisms are evaluated.", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper claims 'alignment' improvement but measures downstream benchmark scores (ARC, MMLU, GSM8k, etc.) that are far removed from human preference alignment; no discussion of this proxy gap is provided.", 49 "source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": false, 56 "justification": "Limitations appear only as two sentences within the Conclusions section ('Our theory only indicate the convergence to stationary point… The additional computation resources… are not negligible'), not as a dedicated section.", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": false, 62 "justification": "The only threats mentioned are convergence to stationary vs. global optima and compute cost; no threats related to dataset scope, hyperparameter sensitivity, or benchmark-alignment validity are discussed.", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": false, 68 "justification": "The paper notes focus on demonstration (not preference) datasets but does not explicitly state what the results do not show (e.g., larger models, other domains, non-English tasks).", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": true, 76 "justification": "NSF grants (EPCN-2311007, ECCS-2426064, CCF-1910385, ECCS-2240789) and Minnesota Supercomputing Institute support are disclosed in the Acknowledgements.", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "All author affiliations (University of Minnesota, Chinese University of Hong Kong, Texas A&M University) are clearly listed on the title page.", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": true, 87 "answer": true, 88 "justification": "All funding is from NSF (government agency), which has no financial stake in the outcome of LLM alignment research.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "No competing interests or financial interests statement (patents, equity, consulting) is present in the paper.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": true, 102 "justification": "SFT, RLHF, IRL, reward model, demonstration data, and preference data are all explicitly defined with mathematical formulations in Section 2 before use.", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "The contribution bullet points in Section 1 clearly state: a new bilevel IRL formulation, two algorithms (RFT and IRFT), a connection to SPIN, and convergence guarantees.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper situates itself relative to SFT, RLHF, DPO, SPIN, and MaxEnt-IRL in both the main text and Appendix A, explicitly showing how IRFT subsumes SPIN and extends IRL to LLM alignment.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "empirical": { 121 "artifacts": { 122 "code_released": { 123 "applies": true, 124 "answer": true, 125 "justification": "Code is available at https://github.com/JasonJiaxiangLi/Reward_learning_SFT as stated in the abstract.", 126 "source": "haiku" 127 }, 128 "data_released": { 129 "applies": true, 130 "answer": true, 131 "justification": "All datasets used (Anthropic-HH, Ultrachat200k) are publicly available standard benchmarks not created by the authors.", 132 "source": "haiku" 133 }, 134 "environment_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "Appendix C lists GPUs (NVIDIA A100-40G), precision (bfloat16), and libraries (DeepSpeed ZeRO-3, FlashAttention-2, TRL, lm-eval v0.4.2) but provides no requirements.txt, Dockerfile, or environment lock file.", 138 "source": "haiku" 139 }, 140 "reproduction_instructions": { 141 "applies": true, 142 "answer": false, 143 "justification": "Appendix C provides hyperparameters and hardware details but no step-by-step commands or scripts to reproduce specific tables; the NeurIPS checklist claims reproducibility but no runnable instructions are included in the paper.", 144 "source": "haiku" 145 } 146 }, 147 "statistical_methodology": { 148 "confidence_intervals_or_error_bars": { 149 "applies": true, 150 "answer": false, 151 "justification": "The NeurIPS checklist claims error bars are included for plots, and Figure 2 shows variance bands on training curves, but the main comparison Tables 3, 4, 6, 7, and 8 report point estimates with no uncertainty.", 152 "source": "haiku" 153 }, 154 "significance_tests": { 155 "applies": true, 156 "answer": false, 157 "justification": "No statistical significance tests are performed for any of the comparative claims; improvements of 0.09–1.55% average are reported as improvements without significance assessment.", 158 "source": "haiku" 159 }, 160 "effect_sizes_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Absolute percentage improvements on each leaderboard task and averages are reported in Tables 3–4 with baseline context, enabling effect size interpretation.", 164 "source": "haiku" 165 }, 166 "sample_size_justified": { 167 "applies": true, 168 "answer": false, 169 "justification": "The choice of 10k samples for RFT and 50k for IRFT follows SPIN's strategy without justification for adequacy or power considerations.", 170 "source": "haiku" 171 }, 172 "variance_reported": { 173 "applies": true, 174 "answer": false, 175 "justification": "Variance bands appear in Figure 2 training curves, but the primary results tables (3, 4, 6, 8) report no standard deviation or variance across runs.", 176 "source": "haiku" 177 } 178 }, 179 "evaluation_design": { 180 "baselines_included": { 181 "applies": true, 182 "answer": true, 183 "justification": "The pretrained base model, standard SFT, and SPIN (the closest prior work) are all included as explicit baselines in Tables 3 and 4.", 184 "source": "haiku" 185 }, 186 "baselines_contemporary": { 187 "applies": true, 188 "answer": true, 189 "justification": "SPIN (Chen et al., 2024) is a concurrent/recent method; SFT is the appropriate fundamental baseline for this work.", 190 "source": "haiku" 191 }, 192 "ablation_study": { 193 "applies": true, 194 "answer": true, 195 "justification": "Different values of T (1, 2, 5, 8, 10, 16) and K are tested in Tables 3 and 4, ablating the generation frequency parameter central to the method.", 196 "source": "haiku" 197 }, 198 "multiple_metrics": { 199 "applies": true, 200 "answer": true, 201 "justification": "Six downstream tasks (ARC, TruthfulQA, Winogrande, GSM8k, HellaSwag, MMLU) plus reward model scores and win rate are used across experiments.", 202 "source": "haiku" 203 }, 204 "human_evaluation": { 205 "applies": true, 206 "answer": false, 207 "justification": "No human judges evaluate system outputs; the paper claims alignment improvement but relies entirely on automated benchmarks and an automated reward model (PKU beaver-7b).", 208 "source": "haiku" 209 }, 210 "held_out_test_set": { 211 "applies": true, 212 "answer": true, 213 "justification": "HuggingFace Open LLM Leaderboard tasks use held-out test sets; for Anthropic-HH, a separate test split is used for reward evaluation.", 214 "source": "haiku" 215 }, 216 "per_category_breakdown": { 217 "applies": true, 218 "answer": true, 219 "justification": "Tables 3 and 4 report per-task performance across all six leaderboard tasks separately, not just averaged scores.", 220 "source": "haiku" 221 }, 222 "failure_cases_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No systematic analysis of when the method fails or underperforms is provided; the section notes 1b models are 'not strong enough for GSM8k' but does not analyze failure modes.", 226 "source": "haiku" 227 }, 228 "negative_results_reported": { 229 "applies": true, 230 "answer": true, 231 "justification": "Some IRFT configurations underperform SPIN (e.g., T=8, K=8 scores 59.85 vs SPIN iter2's 61.02 in Table 4), and these are reported without suppression.", 232 "source": "haiku" 233 } 234 }, 235 "setup_transparency": { 236 "model_versions_specified": { 237 "applies": true, 238 "answer": true, 239 "justification": "Specific model identifiers are given: pythia-1.4b (Biderman et al., 2023), zephyr-7b-sft-full (Tunstall et al., 2023), and the evaluator PKU-Alignment/beaver-7b-v3.0-reward (Dai et al., 2024).", 240 "source": "haiku" 241 }, 242 "prompts_provided": { 243 "applies": true, 244 "answer": true, 245 "justification": "The exact prompt template '### Instruction: prompt\\n\\n### Response: ' is provided in Appendix C.", 246 "source": "haiku" 247 }, 248 "hyperparameters_reported": { 249 "applies": true, 250 "answer": true, 251 "justification": "Learning rates (5e-7, 1e-7), β=0.1, max sequence lengths (1024/2048), batch sizes per device, optimizer (RMSProp), precision (bfloat16), and T/K values are all reported.", 252 "source": "haiku" 253 }, 254 "scaffolding_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "This is a training methodology paper with no agentic scaffolding; evaluation uses standard LM harness.", 258 "source": "haiku" 259 }, 260 "data_preprocessing_documented": { 261 "applies": true, 262 "answer": true, 263 "justification": "Section 5.1 and Appendix C describe selecting top-10k Anthropic-HH examples by reward score and 50k Ultrachat200k following SPIN's strategy, with clear rationale.", 264 "source": "haiku" 265 } 266 }, 267 "data_integrity": { 268 "raw_data_available": { 269 "applies": true, 270 "answer": true, 271 "justification": "All training data (Anthropic-HH, Ultrachat200k) is publicly available on HuggingFace; the data selection procedure is described and reproducible.", 272 "source": "haiku" 273 }, 274 "data_collection_described": { 275 "applies": true, 276 "answer": true, 277 "justification": "Data collection is described: Anthropic-HH top-10k selected by PKU reward model score; Ultrachat200k subset following SPIN strategy.", 278 "source": "haiku" 279 }, 280 "recruitment_methods_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "Standard public benchmarks are used with no participant recruitment.", 284 "source": "haiku" 285 }, 286 "data_pipeline_documented": { 287 "applies": true, 288 "answer": true, 289 "justification": "The pipeline from raw public datasets through selection, training, and evaluation is described across Section 5.1 and Appendix C.", 290 "source": "haiku" 291 } 292 }, 293 "contamination": { 294 "training_cutoff_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "The training data cutoffs for pythia and zephyr base models are not stated; evaluation benchmarks like MMLU and ARC predate these models, raising contamination concerns not addressed.", 298 "source": "haiku" 299 }, 300 "train_test_overlap_discussed": { 301 "applies": true, 302 "answer": false, 303 "justification": "No discussion of whether Anthropic-HH or Ultrachat200k training data overlaps with the HuggingFace Open LLM Leaderboard evaluation tasks.", 304 "source": "haiku" 305 }, 306 "benchmark_contamination_addressed": { 307 "applies": true, 308 "answer": false, 309 "justification": "The Open LLM Leaderboard benchmarks (ARC, TruthfulQA, etc.) were publicly available before the training data cutoff of zephyr/pythia; contamination is not discussed.", 310 "source": "haiku" 311 } 312 }, 313 "human_studies": { 314 "pre_registered": { 315 "applies": false, 316 "answer": false, 317 "justification": "No human participants.", 318 "source": "haiku" 319 }, 320 "irb_or_ethics_approval": { 321 "applies": false, 322 "answer": false, 323 "justification": "No human participants.", 324 "source": "haiku" 325 }, 326 "demographics_reported": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participants.", 330 "source": "haiku" 331 }, 332 "inclusion_exclusion_criteria": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participants.", 336 "source": "haiku" 337 }, 338 "randomization_described": { 339 "applies": false, 340 "answer": false, 341 "justification": "No human participants.", 342 "source": "haiku" 343 }, 344 "blinding_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human participants.", 348 "source": "haiku" 349 }, 350 "attrition_reported": { 351 "applies": false, 352 "answer": false, 353 "justification": "No human participants.", 354 "source": "haiku" 355 } 356 }, 357 "cost_and_practicality": { 358 "inference_cost_reported": { 359 "applies": true, 360 "answer": false, 361 "justification": "Inference/generation time is discussed qualitatively ('generation is time-consuming') in Section 4 but no quantitative latency or cost figures are provided.", 362 "source": "haiku" 363 }, 364 "compute_budget_stated": { 365 "applies": true, 366 "answer": false, 367 "justification": "GPU type and count are stated (2×A100-40G for 1b, 8×A100-40G for 7b) but total training time, number of GPU-hours, or compute cost are not reported.", 368 "source": "haiku" 369 } 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "Reward learning from demonstration data significantly improves LLM alignment over standard SFT", 376 "evidence": "Tables 3 and 4 show IRFT T=10 improves zephyr-7b average from 59.48% to 61.03% on Open LLM Leaderboard; Figure 2 shows higher reward scores and win rates for RFT over SFT on Anthropic-HH", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "IRL-based algorithms converge to stationary solutions of the proposed bilevel formulation", 381 "evidence": "Theorem 3.1 proves convergence rate O(1/√TK) under Assumption B.1; full proof provided in Appendix B", 382 "supported": "strong" 383 }, 384 { 385 "claim": "IRFT with T=1 is equivalent to SPIN (Chen et al., 2024)", 386 "evidence": "Lemma 3.2 and the discussion in Section 4 analytically show the gradient estimator reduces to SPIN when T=1, deriving SPIN from the IRL framework", 387 "supported": "strong" 388 }, 389 { 390 "claim": "IRL-based methods can distinguish preferred from non-preferred continuations despite being trained only on preferred data", 391 "evidence": "Figure 1 (right) shows RFT and IRFT produce positive log-probability gaps between chosen/rejected continuations while SFT assigns higher probability to non-preferred responses", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "More frequent generation (higher T) generally improves performance but with diminishing returns", 396 "evidence": "Table 3 shows IRFT T=10 (37.69%) outperforms T=5 (37.36%) and T=1/SPIN (37.35%) but T=16 (37.57%) shows non-monotonic behavior; Table 4 shows T=10 best overall", 397 "supported": "weak" 398 }, 399 { 400 "claim": "Reward model learned from demonstration data alone has strong capability to distinguish chosen vs rejected responses", 401 "evidence": "Table 8 shows IRFT T=5 achieves 55.6% win-rate on UltraFeedback preference dataset (not in training), compared to 42.6% for SFT and 42.8% for SPIN", 402 "supported": "moderate" 403 } 404 ], 405 "methodology_tags": [ 406 "benchmark-eval", 407 "theoretical" 408 ], 409 "key_findings": "The paper proposes two IRL-based algorithms (RFT and IRFT) that extract reward signals from demonstration-only data to improve SFT for LLM alignment. Theoretically, both algorithms are proven to converge to stationary points at rate O(1/√TK), and IRFT is shown to subsume SPIN as a special case. Empirically, IRFT with optimal T (around 5–10) consistently outperforms vanilla SFT and SPIN on the HuggingFace Open LLM Leaderboard, with zephyr-7b average improving from 59.48% to 61.03%. The implicit reward learned purely from demonstration data shows meaningful generalization to preference discrimination on held-out datasets (55.6% win-rate vs. 42.6% for SFT).", 410 "red_flags": [ 411 { 412 "flag": "No error bars on main tables", 413 "detail": "Tables 3, 4, 6, 7, and 8 report point estimates without standard deviation or confidence intervals; improvements of 0.09–1.55% are claimed significant without statistical tests." 414 }, 415 { 416 "flag": "Alignment proxy gap", 417 "detail": "Claims of 'alignment improvement' are measured via NLP benchmarks (ARC, MMLU, GSM8k) that test general knowledge and reasoning, not alignment to human preferences; the connection is asserted but not validated." 418 }, 419 { 420 "flag": "Baseline version confound", 421 "detail": "Appendix D reveals the zephyr-7b base model used differs from the SPIN paper's baseline (GSM8k 34.19% vs 26.23%), making direct comparison with SPIN's reported improvements misleading." 422 }, 423 { 424 "flag": "Limited model diversity", 425 "detail": "Experiments cover only two model families (pythia ≤1.4b and zephyr-7b); no larger models tested despite the conclusion being framed as general." 426 }, 427 { 428 "flag": "No statistical significance testing", 429 "detail": "Small gains (e.g., 37.27% → 37.69% average for pythia) are presented as improvements without any test of whether they exceed noise." 430 } 431 ], 432 "cited_papers": [ 433 { 434 "title": "Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models", 435 "relevance": "SPIN is the closest prior work; the paper shows IRFT subsumes SPIN and provides a theoretical foundation for it via IRL." 436 }, 437 { 438 "title": "Training Language Models to Follow Instructions with Human Feedback", 439 "relevance": "InstructGPT (Ouyang et al., 2022) establishes RLHF as the baseline alignment paradigm this work seeks to improve." 440 }, 441 { 442 "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", 443 "relevance": "DPO is the key implicit-reward comparison method; the paper draws analogies between RFT:IRFT and RLHF:DPO." 444 }, 445 { 446 "title": "Maximum-Likelihood Inverse Reinforcement Learning with Finite-Time Guarantees", 447 "relevance": "Zeng et al. (2022) provides the IRL theoretical framework (ML-IRL) that this paper adapts to LLM alignment." 448 }, 449 { 450 "title": "Zephyr: Direct Distillation of LM Alignment", 451 "relevance": "Zephyr-7b-sft-full is the primary 7b base model used in experiments." 452 }, 453 { 454 "title": "Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling", 455 "relevance": "Pythia models are the primary small-scale experimental subjects." 456 }, 457 { 458 "title": "Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback", 459 "relevance": "Anthropic-HH is the preference/demonstration dataset used for RFT experiments." 460 }, 461 { 462 "title": "Safe RLHF: Safe Reinforcement Learning from Human Feedback", 463 "relevance": "PKU beaver-7b reward model is used as the evaluation reward model for Anthropic-HH experiments." 464 } 465 ], 466 "engagement_factors": { 467 "practical_relevance": { 468 "score": 2, 469 "justification": "Practitioners can apply IRFT to their SFT pipelines with similar compute cost to SPIN; code is released and the method is straightforward to implement." 470 }, 471 "surprise_contrarian": { 472 "score": 2, 473 "justification": "The claim that reward learning is useful even without preference data, and that standard SFT may be suboptimal, challenges the conventional two-stage alignment pipeline." 474 }, 475 "fear_safety": { 476 "score": 0, 477 "justification": "No safety or AI risk angle; the paper is a training methodology improvement with no adversarial or misuse dimension." 478 }, 479 "drama_conflict": { 480 "score": 1, 481 "justification": "The paper reinterprets SPIN as a special case of IRL, which could be seen as a mild priority dispute, but the framing is collaborative rather than adversarial." 482 }, 483 "demo_ability": { 484 "score": 2, 485 "justification": "Code is released on GitHub; practitioners with A100 access can run the experiments, though the compute requirement (8×A100 for 7b) limits casual reproduction." 486 }, 487 "brand_recognition": { 488 "score": 1, 489 "justification": "University of Minnesota and Texas A&M are respected but not top-tier name-recognition labs; NeurIPS venue adds some credibility signal." 490 } 491 }, 492 "hn_data": { 493 "threads": [ 494 { 495 "hn_id": "39837741", 496 "title": "The Unreasonable Ineffectiveness of the Deeper Layers", 497 "points": 4, 498 "comments": 0, 499 "url": "https://news.ycombinator.com/item?id=39837741" 500 }, 501 { 502 "hn_id": "40832702", 503 "title": "Leapfrogging Sycamore: 1432 GPUs for 7× Faster Quantum Random Circuit Sampling", 504 "points": 3, 505 "comments": 0, 506 "url": "https://news.ycombinator.com/item?id=40832702" 507 }, 508 { 509 "hn_id": "42330669", 510 "title": "Compressing Large Language Models Using Low Rank and Low Precision Decomposition", 511 "points": 2, 512 "comments": 1, 513 "url": "https://news.ycombinator.com/item?id=42330669" 514 }, 515 { 516 "hn_id": "39903302", 517 "title": "The Unreasonable Ineffectiveness of the Deeper Layers", 518 "points": 2, 519 "comments": 0, 520 "url": "https://news.ycombinator.com/item?id=39903302" 521 } 522 ], 523 "top_points": 4, 524 "total_points": 11, 525 "total_comments": 1 526 } 527 }