scan.json (30552B)
1 { 2 "paper": { 3 "title": "RLTHF: Targeted Human Feedback for LLM Alignment", 4 "authors": [ 5 "Yifei Xu", 6 "Tusher Chakraborty", 7 "Emre Kıcıman", 8 "Bibek Aryal", 9 "Eduardo Rodrigues", 10 "Srinagesh Sharma", 11 "Roberto Estevao", 12 "Maria Angels de Luis Balaguer", 13 "Jessica Wolk", 14 "Rafael Padilha", 15 "Leonardo Nunes", 16 "Shobana Balakrishnan", 17 "Songwu Lu", 18 "Ranveer Chandra" 19 ], 20 "year": 2025, 21 "venue": "International Conference on Machine Learning", 22 "arxiv_id": "2502.13417", 23 "doi": "10.48550/arXiv.2502.13417" 24 }, 25 "scan_version": 3, 26 "active_modules": ["experimental_rigor", "data_leakage"], 27 "methodology_tags": ["benchmark-eval"], 28 "key_findings": "RLTHF, a human-AI hybrid framework, achieves preference accuracy comparable to fully human-annotated datasets (89.6% vs 91.8% on HH-RLHF, 88.0% vs 89.6% on TL;DR) using only 6-7% of total human annotation effort. Models fine-tuned with DPO on RLTHF-curated data outperform those trained on fully human-annotated data (58.1% vs 55.7% win rate on HH-RLHF). Ablation studies show that both human annotation amplification and back-off sanitization are essential components, and pure self-improvement without human feedback fails to exceed the initial AI baseline.", 29 "checklist": { 30 "artifacts": { 31 "code_released": { 32 "applies": true, 33 "answer": false, 34 "justification": "No code repository URL, GitHub link, or archive is provided anywhere in the paper, appendix, or footnotes." 35 }, 36 "data_released": { 37 "applies": true, 38 "answer": true, 39 "justification": "The paper uses publicly available datasets: HH-RLHF (Bai et al., 2022a) with 161K samples and TL;DR (Völske et al., 2017) with 93K samples filtered by OpenAI (Stiennon et al., 2020). Both are standard public benchmarks." 
40 }, 41 "environment_specified": { 42 "applies": true, 43 "answer": false, 44 "justification": "Appendix D.2 mentions '8×A100 NVIDIA GPUs with DeepSpeed' but provides no software versions, no requirements.txt, no Dockerfile, and no library version details beyond model names." 45 }, 46 "reproduction_instructions": { 47 "applies": true, 48 "answer": false, 49 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. Algorithm 1 in Appendix B gives pseudocode but not executable instructions." 50 } 51 }, 52 "statistical_methodology": { 53 "confidence_intervals_or_error_bars": { 54 "applies": true, 55 "answer": false, 56 "justification": "All results in Figures 3-5 and Tables 1-2 are reported as single point estimates with no confidence intervals, error bars, or ± notation." 57 }, 58 "significance_tests": { 59 "applies": true, 60 "answer": false, 61 "justification": "The paper makes numerous comparative claims (e.g., 'RLTHF outperforms Random', 'RLTHF outperforms Human baseline') but no statistical significance tests are reported anywhere." 62 }, 63 "effect_sizes_reported": { 64 "applies": true, 65 "answer": true, 66 "justification": "Effect sizes are provided with baseline context throughout: e.g., '74.7% to 89.6% with only 6% human annotations' (Section 4.1.1), '15.9× and 5.3× higher ROI' (Section 4.1.2), and win rates in Table 2 with all baselines shown for comparison." 67 }, 68 "sample_size_justified": { 69 "applies": true, 70 "answer": false, 71 "justification": "No justification for the choice of dataset sizes, shard sizes, or number of iterations. The 1/4 shard is selected as 'optimal' from empirical observation but no power analysis or formal justification is given." 72 }, 73 "variance_reported": { 74 "applies": true, 75 "answer": false, 76 "justification": "No standard deviations, variance across seeds, or spread measures are reported. All results appear to be from single experimental runs." 
77 } 78 }, 79 "evaluation_design": { 80 "baselines_included": { 81 "applies": true, 82 "answer": true, 83 "justification": "Three baselines are compared: (1) AI-only labeling (GPT-4o / GPT-4o mini), (2) Random human annotation with equal annotation budget, and (3) full human annotation. Described in Section 4 and Appendix D.3." 84 }, 85 "baselines_contemporary": { 86 "applies": true, 87 "answer": true, 88 "justification": "Baselines use GPT-4o and GPT-4o mini (2024 models), and the comparison against full human annotation is standard. The RLAIF-style baseline is current." 89 }, 90 "ablation_study": { 91 "applies": true, 92 "answer": true, 93 "justification": "Section 4.1.4 presents ablation studies: removing human annotations entirely ('No Annotation'), and removing both amplification and back-off mechanisms ('No Ampl./Back-off'). Results in Table 1 show both are necessary." 94 }, 95 "multiple_metrics": { 96 "applies": true, 97 "answer": true, 98 "justification": "Two evaluation metrics are used: RM preference accuracy on test data (Section 4.1) and pairwise win rate on downstream DPO-trained models via AlpacaEval (Section 4.2, Table 2)." 99 }, 100 "human_evaluation": { 101 "applies": true, 102 "answer": false, 103 "justification": "No human evaluation of system outputs is conducted. Win rates are computed using Claude 3.5 Sonnet as an LLM judge (Section 4.2, Appendix E). The 'human annotations' in the paper are for training data labels, not output evaluation." 104 }, 105 "held_out_test_set": { 106 "applies": true, 107 "answer": true, 108 "justification": "Appendix D.1.1 states 'All test samples are completely separated from the training samples throughout the experiments.' Section 4.2 confirms 'Evaluations are conducted on held-out test sets' of 4K samples each." 
109 }, 110 "per_category_breakdown": { 111 "applies": true, 112 "answer": true, 113 "justification": "Results are broken down by dataset (HH-RLHF vs TL;DR), by shard size (full, 1/2, 1/4, 1/8), by initial LLM (GPT-4o vs GPT-4o mini), and by iteration number. Hyperparameter effects are shown per-setting in Figure 5." 114 }, 115 "failure_cases_discussed": { 116 "applies": true, 117 "answer": true, 118 "justification": "Section 4.1.3 discusses suboptimal configurations: excessive amplification (α=8) leads to overfitting, and excessive down-sampling (1/8 shard) limits accuracy. Table 1 shows the failure of self-improvement alone." 119 }, 120 "negative_results_reported": { 121 "applies": true, 122 "answer": true, 123 "justification": "Table 1 ablation shows 'No Annotation' achieves no improvement (stays at ~75% for HH-RLHF). Section 4.1.3 reports that α=8 and α=1 are suboptimal, and further subdividing annotation batches 'does not provide a meaningful advantage.'" 124 } 125 }, 126 "claims_and_evidence": { 127 "abstract_claims_supported": { 128 "applies": true, 129 "answer": true, 130 "justification": "Abstract claims are supported: 'full-human annotation-level alignment with only 6-7%' matches Figure 3 (89.6% vs 91.8% on HH-RLHF with 6%); 'outperform those trained on fully human-annotated datasets' matches Table 2 (58.1% vs 55.7% on HH-RLHF)." 131 }, 132 "causal_claims_justified": { 133 "applies": true, 134 "answer": true, 135 "justification": "Causal claims like 'amplification and back-off are essential' are supported by the controlled ablation in Table 1, which compares full RLTHF against a 'No Annotation' variant and a 'No Ampl./Back-off' variant. Amplification and back-off are removed together rather than one at a time, so the ablation establishes that the pair is necessary but does not isolate each mechanism's individual contribution; this is adequate for the paper's internal component claims." 136 }, 137 "generalization_bounded": { 138 "applies": true, 139 "answer": true, 140 "justification": "The abstract bounds claims to 'HH-RLHF and TL;DR datasets' specifically. 
The paper doesn't claim universal applicability, and Section 3.2.5 notes that 'effectiveness depends on factors such as hyperparameter tuning, the original data distribution, and model selection.'" 141 }, 142 "alternative_explanations_discussed": { 143 "applies": true, 144 "answer": false, 145 "justification": "No substantive discussion of alternative explanations for the main results. The paper does not consider whether improvements might be due to data selection effects independent of the reward distribution approach, or whether the label-flipping ground truth procedure biases results in favor of RLTHF." 146 }, 147 "proxy_outcome_distinction": { 148 "applies": true, 149 "answer": false, 150 "justification": "The paper measures RM preference accuracy and LLM-judged win rate but frames these as 'alignment' without discussing the gap. Win rate via Claude 3.5 Sonnet is a proxy for human preference; the paper acknowledges self-enhancement bias (Appendix E) but does not discuss whether LLM-judged win rates reliably measure actual alignment with human preferences." 151 } 152 }, 153 "setup_transparency": { 154 "model_versions_specified": { 155 "applies": true, 156 "answer": false, 157 "justification": "The paper uses 'GPT-4o' and 'GPT-4o mini' without API version or snapshot dates. 'Llama-3.1-8B-Instruct' and 'Qwen2.5-3B' are specified by size but without exact checkpoint versions. 'Claude 3.5 Sonnet' is used as judge without a version identifier." 158 }, 159 "prompts_provided": { 160 "applies": true, 161 "answer": true, 162 "justification": "Full prompt text is provided in Appendix A: system prompt (A.1), user prompt for HH-RLHF (A.2), user prompt for TL;DR (A.3), and adapted win-rate prompts in Appendix E (E.1, E.2). These include actual text, not just descriptions." 
163 }, 164 "hyperparameters_reported": { 165 "applies": true, 166 "answer": true, 167 "justification": "Appendix D.2 reports detailed hyperparameters: SFT (lr=2e-5, warmup=0.2, batch=32, 4 epochs), RM (lr=1e-4, LoRA rank=32, alpha=64, batch=128, 2 epochs), DPO (lr=1e-6, beta=0.1/0.5, batch=64, 4 epochs). RLTHF-specific configs in Appendix D.4." 168 }, 169 "scaffolding_described": { 170 "applies": false, 171 "answer": false, 172 "justification": "No agentic scaffolding is used. RLTHF is a reward modeling and annotation pipeline, not an agentic system." 173 }, 174 "data_preprocessing_documented": { 175 "applies": true, 176 "answer": true, 177 "justification": "Appendix D.1 describes data preparation including dataset composition. Appendix D.1.2 details the label-flipping procedure with specific percentages (25% for HH-RLHF, 20% for TL;DR) and justification via Table 3. Sharding approach is described in Section 3.2.5." 178 } 179 }, 180 "limitations_and_scope": { 181 "limitations_section_present": { 182 "applies": true, 183 "answer": false, 184 "justification": "There is no dedicated limitations section. The paper has only a brief 'Impact Statement' that says 'There are many potential societal consequences of our work, none which we feel must be specifically highlighted here.' This does not constitute substantive discussion of limitations." 185 }, 186 "threats_to_validity_specific": { 187 "applies": true, 188 "answer": false, 189 "justification": "No specific threats to validity are discussed anywhere in the paper. Appendix D.4 briefly notes hyperparameters 'may underestimate RLTHF's full potential' but this is not a threat-to-validity discussion." 190 }, 191 "scope_boundaries_stated": { 192 "applies": true, 193 "answer": false, 194 "justification": "No explicit scope boundaries are stated. The paper does not discuss what settings, tasks, or conditions RLTHF would not work for, or what the results do not show." 
195 } 196 }, 197 "data_integrity": { 198 "raw_data_available": { 199 "applies": true, 200 "answer": true, 201 "justification": "The underlying datasets (HH-RLHF and TL;DR) are publicly available. However, the RLTHF-specific intermediate outputs (reward distributions, iteration-level datasets) are not released." 202 }, 203 "data_collection_described": { 204 "applies": true, 205 "answer": true, 206 "justification": "Appendix D.1.1 describes both datasets: HH-RLHF (161K samples, conversation context with preferred/non-preferred responses) and TL;DR (93K samples, Reddit posts with human preference pairs on model summarizations)." 207 }, 208 "recruitment_methods_described": { 209 "applies": false, 210 "answer": false, 211 "justification": "No new human participants were recruited. The 'human annotations' used in experiments are drawn from existing labels in the public HH-RLHF and TL;DR datasets. Data sources are standard benchmarks." 212 }, 213 "data_pipeline_documented": { 214 "applies": true, 215 "answer": true, 216 "justification": "The full pipeline is documented: Figure 1 gives an overview, Algorithm 1 (Appendix B) provides pseudocode, Section 3 describes each stage in detail, and Appendix D describes experimental configuration. The label-flipping transformation is documented with counts (25%/20%)." 217 } 218 }, 219 "conflicts_of_interest": { 220 "funding_disclosed": { 221 "applies": true, 222 "answer": false, 223 "justification": "No funding sources or grants are disclosed. The paper notes 'Work done during an internship at Microsoft' but does not include an acknowledgments section listing funding." 224 }, 225 "affiliations_disclosed": { 226 "applies": true, 227 "answer": true, 228 "justification": "All author affiliations are listed: 13 authors at Microsoft, 1 at UCLA (first author, who did the work during a Microsoft internship). The Microsoft affiliation is clearly stated." 
229 }, 230 "funder_independent_of_outcome": { 231 "applies": true, 232 "answer": false, 233 "justification": "Microsoft, the employer of all authors, has a direct commercial interest in reducing RLHF annotation costs for its Azure AI and Copilot products. The funder is not independent of the research outcome." 234 }, 235 "financial_interests_declared": { 236 "applies": true, 237 "answer": false, 238 "justification": "No competing interests or financial disclosure statement is included in the paper." 239 } 240 }, 241 "contamination": { 242 "training_cutoff_stated": { 243 "applies": true, 244 "answer": false, 245 "justification": "No training data cutoff dates are stated for any of the models used (GPT-4o, GPT-4o mini, Llama-3.1-8B-Instruct, Qwen2.5-3B). HH-RLHF (2022) and TL;DR (2017/2020) predate all these models, raising contamination concerns for the initial AI labeling step." 246 }, 247 "train_test_overlap_discussed": { 248 "applies": true, 249 "answer": false, 250 "justification": "No discussion of whether GPT-4o may have been trained on HH-RLHF or TL;DR data, which could inflate the initial AI labeling accuracy and bias the RLTHF starting point." 251 }, 252 "benchmark_contamination_addressed": { 253 "applies": true, 254 "answer": false, 255 "justification": "HH-RLHF was published in 2022 and TL;DR in 2017/2020, well before the training of GPT-4o, Llama 3.1, and Qwen 2.5. No discussion of whether these benchmarks appeared in any model's training data." 256 } 257 }, 258 "human_studies": { 259 "pre_registered": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study. 'Human annotations' are drawn from existing labels in public datasets." 263 }, 264 "irb_or_ethics_approval": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants. The study uses pre-existing public preference datasets." 
268 }, 269 "demographics_reported": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants were recruited for this study." 273 }, 274 "inclusion_exclusion_criteria": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants were recruited for this study." 278 }, 279 "randomization_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants were recruited for this study." 283 }, 284 "blinding_described": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants were recruited for this study." 288 }, 289 "attrition_reported": { 290 "applies": false, 291 "answer": false, 292 "justification": "No human participants were recruited for this study." 293 } 294 }, 295 "cost_and_practicality": { 296 "inference_cost_reported": { 297 "applies": true, 298 "answer": true, 299 "justification": "Appendix F provides detailed cost analysis: GPT-4o costs $0.003/sample, GPT-4o mini $0.00018/sample. RM training costs $32.77/hour on 8×A100. Table 5 summarizes total costs for different configurations." 300 }, 301 "compute_budget_stated": { 302 "applies": true, 303 "answer": true, 304 "justification": "Appendix F states RM training and inference takes '<8 hours on the full dataset, and less than 2 hours on the 1/4 subset' per iteration, with 7 iterations. Hardware: 8×A100 80GB node. Total costs in Table 5." 305 } 306 }, 307 "experimental_rigor": { 308 "seed_sensitivity_reported": { 309 "applies": true, 310 "answer": false, 311 "justification": "No mention of multiple random seeds. All results appear to be from single experimental runs with no seed sensitivity analysis." 312 }, 313 "number_of_runs_stated": { 314 "applies": true, 315 "answer": false, 316 "justification": "The number of experimental runs is never explicitly stated. Results are presented without indicating how many runs produced them." 
317 }, 318 "hyperparameter_search_budget": { 319 "applies": true, 320 "answer": false, 321 "justification": "Section 4.1.3 varies hyperparameters systematically, but Appendix D.4 states configurations are 'chosen based on heuristics and limited empirical observations' with no total search budget reported." 322 }, 323 "best_config_selection_justified": { 324 "applies": true, 325 "answer": false, 326 "justification": "The 1/4 shard is selected as 'optimal' from Figure 3 by visual inspection. RLTHF hyperparameters (α, β schedules) are chosen by heuristics. No formal validation-set-based selection procedure is described." 327 }, 328 "multiple_comparison_correction": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper makes many comparative claims across multiple conditions, datasets, and iterations without performing any statistical tests, let alone corrections for multiple comparisons." 332 }, 333 "self_comparison_bias_addressed": { 334 "applies": true, 335 "answer": false, 336 "justification": "The authors implement all baselines (AI-only, Random annotation) themselves and compare against their own method. No acknowledgment of author-evaluation bias or independent evaluation." 337 }, 338 "compute_budget_vs_performance": { 339 "applies": true, 340 "answer": false, 341 "justification": "Performance is not reported as a function of compute budget. RLTHF uses iterative RM training (additional compute) but this is not compared against baselines at matched compute levels." 342 }, 343 "benchmark_construct_validity": { 344 "applies": true, 345 "answer": false, 346 "justification": "No discussion of whether HH-RLHF and TL;DR preference accuracy actually measures alignment quality, or whether these benchmarks are valid proxies for real-world alignment goals." 347 }, 348 "scaffold_confound_addressed": { 349 "applies": false, 350 "answer": false, 351 "justification": "No agentic scaffolding is involved. 
RLTHF is a data curation and reward modeling pipeline." 352 } 353 }, 354 "data_leakage": { 355 "temporal_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "HH-RLHF (2022) and TL;DR (2017/2020) were published well before the training of GPT-4o, Llama 3.1, and Qwen 2.5. No discussion of whether models may have seen these preference labels during pre-training, which could inflate initial AI labeling performance." 359 }, 360 "feature_leakage_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "No discussion of whether the evaluation setup leaks answer information. The reward model is trained and evaluated on the same dataset distribution without addressing potential information leakage." 364 }, 365 "non_independence_addressed": { 366 "applies": true, 367 "answer": false, 368 "justification": "While 'all test samples are completely separated from the training samples' (Appendix D.1.1), there is no discussion of structural similarities between train and test splits (e.g., same Reddit threads, similar conversation patterns)." 369 }, 370 "leakage_detection_method": { 371 "applies": true, 372 "answer": false, 373 "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference tests, or decontamination procedures are used." 
374 } 375 } 376 }, 377 "claims": [ 378 { 379 "claim": "RLTHF achieves full-human annotation-level alignment with only 6-7% of human annotation effort", 380 "evidence": "Figure 3 shows RLTHF on 1/4 shard reaches 89.6% preference accuracy on HH-RLHF (vs 91.8% full human) with 6% annotation, and 88.0% on TL;DR (vs 89.6% full human) with 7% annotation (Section 4.1.1).", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Models trained on RLTHF-curated datasets outperform those trained on fully human-annotated datasets on downstream tasks", 385 "evidence": "Table 2 shows DPO-trained Qwen2.5-3B win rates: RLTHF(4o) achieves 58.1% vs Human 55.7% on HH-RLHF, and 62.3% vs 60.2% on TL;DR. Evaluated using Claude 3.5 Sonnet as judge (Section 4.2).", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "RLTHF achieves 15.9× higher return on investment in human annotation compared to random annotation on HH-RLHF", 390 "evidence": "Section 4.1.2 reports 'With just 6% human annotation, RLTHF (4o) achieves a 15.9× and 5.3× higher ROI compared to Random (4o) on HH-RLHF and TL;DR, respectively.' Supported by Figure 4.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Self-improvement without human annotations fails to exceed the initial AI labeling baseline", 395 "evidence": "Table 1 ablation: 'No Annotation' condition stays at ~75% for HH-RLHF and ~75% for TL;DR across all iterations, while full RLTHF improves to 87.7% and 83.7% respectively (Section 4.1.4).", 396 "supported": "strong" 397 }, 398 { 399 "claim": "Both amplification and back-off mechanisms are essential for RLTHF's effectiveness", 400 "evidence": "Table 1: 'No Ampl./Back-off' condition shows only marginal improvement (75.8% after 5 iterations on HH-RLHF vs 87.7% for full RLTHF). 
Section 4.1.4 confirms both mechanisms are necessary.", 401 "supported": "strong" 402 }, 403 { 404 "claim": "RLTHF is robust to weaker initial AI labeling quality", 405 "evidence": "Figure 4 and Table 2: GPT-4o mini starts with 2.6%/5.6% accuracy gap vs GPT-4o on HH-RLHF/TL;DR, but after 10% annotation the gap shrinks to 0.4%/-0.2%. Table 2 win rates are comparable (Section 4.1.2).", 406 "supported": "moderate" 407 }, 408 { 409 "claim": "RLTHF reduces overall annotation cost by 84-86%", 410 "evidence": "Appendix F cost analysis: full human annotation costs $5,788.80 vs RLTHF(4o) at $926.70 and RLTHF(4o mini) at $813.30, including LLM and compute overhead (Table 5).", 411 "supported": "moderate" 412 } 413 ], 414 "red_flags": [ 415 { 416 "flag": "Company evaluating its own method", 417 "detail": "All 14 authors are affiliated with Microsoft (one also with UCLA). Microsoft has commercial interest in reducing RLHF annotation costs for its Azure AI and Copilot fine-tuning services. No independent evaluation is included." 418 }, 419 { 420 "flag": "No error bars or statistical tests", 421 "detail": "All comparisons across Figures 3-5 and Tables 1-2 are point estimates from apparently single runs. No standard deviations, confidence intervals, or significance tests are reported despite numerous comparative claims." 422 }, 423 { 424 "flag": "Manipulated ground truth via label flipping", 425 "detail": "Appendix D.1.2: The authors train an RM on all human labels, then flip 25% (HH-RLHF) and 20% (TL;DR) of labels that disagree with their RM, using the result as 'ground truth.' This means ground truth is partially defined by a model, and RLTHF is then evaluated on its ability to recover this model-influenced ground truth. This introduces potential circularity." 426 }, 427 { 428 "flag": "Simulated human-in-the-loop", 429 "detail": "The 'targeted human feedback' in experiments is drawn from existing dataset labels, not from actual human annotators providing feedback in real time. 
The paper's claims about human annotation efficiency assume this simulation faithfully represents real annotation workflows, which is unvalidated." 430 }, 431 { 432 "flag": "LLM-as-judge for downstream evaluation", 433 "detail": "Win rates in Table 2 are judged by Claude 3.5 Sonnet, not human evaluators. The paper mitigates self-enhancement bias by using a different model family but does not validate whether LLM judge rankings correlate with actual human preferences for these specific tasks." 434 }, 435 { 436 "flag": "No limitations section", 437 "detail": "The paper contains no discussion of limitations, scope boundaries, or threats to validity. The Impact Statement dismisses societal consequences entirely." 438 } 439 ], 440 "cited_papers": [ 441 { 442 "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback", 443 "authors": ["Yuntao Bai"], 444 "year": 2022, 445 "arxiv_id": "2204.05862", 446 "relevance": "Foundational RLHF work and source of the HH-RLHF dataset used in this paper's experiments." 447 }, 448 { 449 "title": "Learning to summarize with human feedback", 450 "authors": ["Nisan Stiennon"], 451 "year": 2020, 452 "relevance": "Seminal work on RLHF for summarization, source of the TL;DR preference dataset used in experiments." 453 }, 454 { 455 "title": "Direct preference optimization: Your language model is secretly a reward model", 456 "authors": ["Rafael Rafailov"], 457 "year": 2024, 458 "relevance": "DPO is a core component of RLTHF's downstream training pipeline and one of the main fine-tuning methods evaluated." 459 }, 460 { 461 "title": "Training language models to follow instructions with human feedback", 462 "authors": ["Long Ouyang"], 463 "year": 2022, 464 "relevance": "InstructGPT paper establishing RLHF as the standard approach for aligning LLMs with human preferences." 
465 }, 466 { 467 "title": "RLAIF: Scaling reinforcement learning from human feedback with AI feedback", 468 "authors": ["Harrison Lee"], 469 "year": 2023, 470 "relevance": "Directly addresses the AI feedback alternative to human annotation that RLTHF builds upon and aims to improve." 471 }, 472 { 473 "title": "Self-evolved reward learning for LLMs", 474 "authors": ["Chenghua Huang"], 475 "year": 2024, 476 "arxiv_id": "2411.00418", 477 "relevance": "Self-improvement approach for reward learning that RLTHF's ablation shows fails without human intervention." 478 }, 479 { 480 "title": "Self-rewarding language models", 481 "authors": ["Weizhe Yuan"], 482 "year": 2024, 483 "arxiv_id": "2401.10020", 484 "relevance": "LLM self-improvement paradigm that RLTHF contrasts with by showing human feedback is essential." 485 }, 486 { 487 "title": "Is DPO superior to PPO for LLM alignment? A comprehensive study", 488 "authors": ["Shusheng Xu"], 489 "year": 2024, 490 "arxiv_id": "2404.10719", 491 "relevance": "Comparative study of preference optimization methods relevant to RLTHF's DPO-based downstream training." 492 }, 493 { 494 "title": "Secrets of RLHF in large language models part II: Reward modeling", 495 "authors": ["Binghai Wang"], 496 "year": 2024, 497 "arxiv_id": "2401.06080", 498 "relevance": "Analysis of reward modeling challenges including annotation noise and biases that RLTHF addresses." 499 }, 500 { 501 "title": "Proximal policy optimization algorithms", 502 "authors": ["John Schulman"], 503 "year": 2017, 504 "arxiv_id": "1707.06347", 505 "relevance": "PPO is one of the two main RL algorithms RLTHF supports for downstream LLM fine-tuning." 506 }, 507 { 508 "title": "A critical evaluation of AI feedback for aligning large language models", 509 "authors": ["Archit Sharma"], 510 "year": 2024, 511 "arxiv_id": "2402.12366", 512 "relevance": "Evaluates limitations of AI feedback for alignment, motivating RLTHF's targeted human annotation approach." 
513 }, 514 { 515 "title": "Constitutional AI: Harmlessness from AI feedback", 516 "authors": ["Yuntao Bai"], 517 "year": 2022, 518 "arxiv_id": "2212.08073", 519 "relevance": "Foundational RLAIF approach that RLTHF extends by incorporating strategic human corrections." 520 } 521 ], 522 "engagement_factors": { 523 "practical_relevance": { 524 "score": 2, 525 "justification": "Practitioners doing RLHF could apply the targeted annotation strategy to reduce human labeling costs by 84-86%, though no code is released." 526 }, 527 "surprise_contrarian": { 528 "score": 1, 529 "justification": "The finding that RLTHF outperforms full human annotation is mildly surprising, but the general idea that targeted annotation beats random annotation is expected." 530 }, 531 "fear_safety": { 532 "score": 0, 533 "justification": "No safety or security concerns raised; the paper focuses on annotation efficiency for alignment." 534 }, 535 "drama_conflict": { 536 "score": 0, 537 "justification": "No controversy or provocative claims about existing methods or organizations." 538 }, 539 "demo_ability": { 540 "score": 0, 541 "justification": "No code repository, demo, or installable tool is provided." 542 }, 543 "brand_recognition": { 544 "score": 2, 545 "justification": "From Microsoft Research, published at ICML 2025; uses GPT-4o prominently in experiments." 546 } 547 } 548 }