scan.json (31237B)
1 { 2 "paper": { 3 "title": "Transfer Q⋆: Principled Decoding for LLM Alignment", 4 "authors": [ 5 "Souradip Chakraborty", 6 "Soumya Suvra Ghosal", 7 "Ming Yin", 8 "Dinesh Manocha", 9 "Mengdi Wang", 10 "Amrit Singh Bedi", 11 "Furong Huang" 12 ], 13 "year": 2024, 14 "venue": "arXiv", 15 "arxiv_id": "2405.20495" 16 }, 17 "scan_version": 3, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "methodology_tags": ["theoretical", "benchmark-eval"], 20 "key_findings": "Transfer Q⋆ (TQ⋆) leverages already-aligned baseline language models to estimate the optimal Q-function for decoding-time alignment, offering both direct transfer (when baseline reward matches target) and indirect transfer (via importance sampling when rewards differ). Theorem 1 bounds the suboptimality gap by β·D_KL(ρ*, ρ_sft) − α·h_α(x) and provides KL-efficiency guarantees. Experimentally, TQ⋆ consistently outperforms controlled decoding (CD), ARGS, and DPO across 6 evaluation setups on UltraFeedback, HH-RLHF, and Berkeley Nectar datasets, achieving up to 1.45x average reward improvement and 67.34% GPT-4 win-tie rate over the best baseline.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": false, 26 "justification": "No code repository URL is provided anywhere in the paper. The paper states 'Reproducibility is ensured through the use of publicly available resources' (Section 4) but this refers to models and datasets, not the authors' own code." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "All datasets used are publicly available: UltraFeedback [12], HH-RLHF [5], and Berkeley Nectar [53]. All models used are open-source (Mistral-7B, Zephyr-7B, Pythia-6.9B, etc.) as listed in Table 1." 
32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "Appendix A states 'Python 3.7.4 and PyTorch 1.9.0' and 'two Nvidia RTX A6000 GPUs,' but no requirements.txt, Dockerfile, or comprehensive dependency list is provided. This is insufficient to recreate the full environment." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. The algorithm pseudocode (Algorithm 1) describes the method but not how to reproduce the specific experimental results." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "All results (Figures 2-8, Table 2) report point estimates only. No confidence intervals, error bars, or ± notation appears in any table or figure." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper claims TQ⋆ 'consistently outperforms' baselines and achieves 'superior efficacy' (Section 4.1) based solely on comparing bar heights and percentages. No statistical significance tests (p-values, t-tests, etc.) are reported." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "The paper reports specific effect magnitudes: 'up to 1.45x in average reward' and '67.34% in GPT-4 based win-tie rate' (Section 1). Normalized rewards provide baseline context (Appendix G.1 describes the normalization formula). Win-tie percentages in Table 2 give interpretable effect sizes." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "For GPT-4 evaluation, 300 randomly sampled prompts are used (Section 4.1) with no justification for why 300 is sufficient. No power analysis or sample size justification is given for any experiment." 
64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. All figures and tables show single-run point estimates with no indication of result stability." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "Multiple baselines are compared: ARGS (SFT and DPO variants) [26], CD (controlled decoding) [33], DPO [38], and the SFT model itself. Table 1 and Figures 2-5 show systematic comparisons." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "All baselines are contemporary: ARGS [26] (2024), CD [33] (2024), and DPO [38] (2023) represent the state of the art in decoding-time and fine-tuning-based alignment at the time of writing." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "Appendix H.3 (Figure 8) provides ablations on hyperparameters k (number of sampled tokens) and α (decoding alignment parameter). Section 4.2 also compares direct transfer (DT) vs. indirect transfer variants, showing DT has 'subpar performance' when rewards differ." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Five distinct metrics are used: average reward, diversity (repeated n-gram frequency), coherence (SimCSE cosine similarity), GPT-4 win-tie rate, and KL divergence (Figure 2b). Figures 2, 3, and Table 2 cover these." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": false, 95 "justification": "GPT-4 is used as a 'surrogate for human assessment' (Section 4.1) but no actual human evaluation is performed. GPT-4 evaluation is an automated metric, not human evaluation." 
96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "The paper evaluates on 'test dataset' and 'test set' (Section 4.1), and for GPT-4 evaluation 'randomly sample 300 prompts from the test set.' The datasets used (UltraFeedback, HH-RLHF, Berkeley Nectar) have standard train/test splits." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Results are broken down across 6 distinct evaluation setups (Table 1), each with different dataset/model combinations. Figures 2-6 show per-setup results. Additionally, direct vs. indirect transfer and synthetic vs. real transfer are separately analyzed." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": false, 110 "justification": "No qualitative analysis of where TQ⋆ fails. The examples in Appendix I (Examples 0-5) all show TQ⋆ producing the best response. No error analysis or discussion of failure modes is provided." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The direct transfer (DT) variant is shown to have 'subpar performance' in indirect transfer settings (Section 4.2, Figure 4). Ablation results (Figure 8) show that some hyperparameter combinations (e.g., low k with high α) produce poor diversity/coherence." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "Abstract claims of 'significantly reduces the sub-optimality gap' are supported by Theorem 1 (Section 3.3). Claims of 'superior empirical performance across key metrics such as coherence, diversity, and quality' are supported by Figures 2-3 and Table 2. The specific '1.45x' and '67.34%' figures appear in the results." 
123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "Causal claims ('TQ⋆ outperforms,' 'reduces the gap') are supported by controlled comparisons where only the decoding strategy varies while models and datasets are held constant. The ablation study (Figure 8) manipulates single variables (k, α). Theoretical results (Theorem 1) provide principled justification for why the method should improve performance." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": false, 132 "justification": "The title claims 'Principled Decoding for LLM Alignment' broadly, but all experiments use 7B-parameter models only. No experiments with larger or smaller models are included, and the paper does not explicitly bound its generalization claims to the tested model sizes, datasets, or reward model families." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "No alternative explanations for the observed improvements are discussed. The paper does not consider confounds such as whether the improvement stems from the specific choice of baseline models, the reward model family, or the greedy sampling strategy rather than the TQ⋆ framework itself." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper uses reward model scores as a proxy for 'alignment' and GPT-4 ratings as a proxy for 'human preferences,' but does not acknowledge the gap between these proxies and actual human-judged alignment quality. The paper frames reward maximization as alignment without discussing the limitations of this equivalence." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": false, 149 "justification": "Open-source models are specified with identifiable names (Mistral-7B-α, Zephyr-7B-α, Pythia-6.9B, etc.) with citations. 
However, GPT-4 is used for a key evaluation metric (Table 2) without specifying a version or snapshot date (e.g., gpt-4-0613). Since GPT-4 behavior changes across versions, this affects reproducibility." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": false, 154 "justification": "The GPT-4 evaluation prompt is described only in natural language: 'we prompt GPT-4 to assess and rate two responses on the same prompt on a scale from 1 to 10, focusing on criteria such as relevance, accuracy, and insightfulness' (Section 4.1). The actual prompt text is not provided." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Key hyperparameters are reported: k=10 tokens sampled, α=1 decoding alignment parameter (Section 4), greedy-based sampling, max prompt length 128 tokens, max response length 2048 tokens (Section 4). Ablations on k and α in Appendix H.3." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used. TQ⋆ is a token-level decoding strategy that modifies the sampling distribution at each step; there are no tools, workflows, retry logic, or memory management involved." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": false, 169 "justification": "Reward normalization is described in Appendix G.1, and prompt/response length limits are stated. However, no documentation of how datasets were preprocessed, filtered, or split before use. The paper goes directly from naming the datasets to presenting results." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": false, 176 "justification": "The paper has no dedicated limitations section. Section 5 (Conclusions) is a brief paragraph with no discussion of limitations. The paper proceeds directly from conclusions to acknowledgments." 
177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": false, 181 "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of specific weaknesses in the experimental design or theoretical assumptions." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": false, 186 "justification": "The paper does not explicitly state what its results do NOT show. No mention of limitations to 7B models, specific reward model families, or the particular datasets tested. The claims are presented as general without explicit scope boundaries." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "While the source datasets are public, the generated responses, reward scores, and GPT-4 evaluation outputs are not released. Independent verification of the specific experimental results is not possible." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "The data sources are well-described: UltraFeedback [12], HH-RLHF [5], and Berkeley Nectar [53] are cited with their properties. Table 1 summarizes which datasets and model architectures are used for each evaluation setup." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants are involved. All data comes from standard publicly available benchmark datasets." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": false, 208 "justification": "The pipeline from raw datasets to final results is not documented. Reward normalization is described (Appendix G.1) but intermediate steps — how prompts are selected, how test sets are constructed, how responses are generated and scored — are not detailed." 
209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Section 6 (Acknowledgments) lists extensive funding: DARPA TIAMAT, NSF-IIS-2147276, DOD-ONR N00014-22-1-2335, DOD-AFOSR FA9550-23-1-0048, DOD-DARPA GARD HR00112020007, Adobe, Capital One, JP Morgan faculty fellowships, and Army Cooperative Agreement W911NF2120076." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are clearly listed: University of Maryland College Park, Princeton University, and University of Central Florida. These are academic institutions not producing the evaluated models." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": true, 225 "justification": "Funders are primarily government agencies (DARPA, NSF, DOD, Army) and corporate fellowships (Adobe, Capital One, JP Morgan). None of these entities has a direct financial stake in whether TQ⋆ outperforms specific decoding baselines." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial interests statement is included in the paper. The absence of such a declaration is noted — absence of disclosure is not the same as absence of conflict." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "No training data cutoff dates are stated for any of the models used (Mistral-7B, Zephyr-7B, Pythia-6.9B, OpenChat, Starling, Llama-2, Tulu, Gemma). Since these models generate responses on datasets that may overlap with their training data, this is a relevant omission." 
238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": false, 242 "justification": "No discussion of whether the evaluation prompts (from UltraFeedback, HH-RLHF, Berkeley Nectar) appeared in the training data of the models being evaluated. This could differentially affect generation quality." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": false, 247 "justification": "The evaluation datasets (UltraFeedback published 2023, HH-RLHF published 2022) were available before or around the training of the models used, yet no contamination analysis is performed." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants are involved in this study. All evaluations are automated." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants. Purely computational experiments using publicly available models and datasets." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants involved." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants involved." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants involved." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants involved." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants involved." 
285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "Despite the paper positioning TQ⋆ as a 'lightweight and adaptable framework,' no actual inference costs, latency measurements, or tokens consumed are reported. The cost of sampling k=10 tokens per step and generating completions from ρBL per candidate is not quantified." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "Appendix A mentions 'two Nvidia RTX A6000 GPUs' but does not state total GPU hours, wall-clock time, or compute budget for the experiments." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The number of experimental runs producing the reported results is never stated. It is unclear whether results are from single or averaged runs." 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "Ablations on k and α are shown (Figure 8), but the total number of configurations tried and the compute spent on hyperparameter selection are not reported." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": true, 318 "justification": "The selection of k=10 and α=1 is justified by the ablation study in Figure 8, which shows these values produce the highest diversity and coherence. The selection criteria are transparent." 319 }, 320 "multiple_comparison_correction": { 321 "applies": false, 322 "answer": false, 323 "justification": "No formal statistical tests are performed in this paper, so multiple comparison correction is not applicable." 
324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "Footnote 1 acknowledges 'Due to unavailability of code base, we compare using an approximate version of CD [33] in which we do not train an adapter module.' This approximate re-implementation could disadvantage the baseline, but the authors do not discuss this self-comparison bias." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "TQ⋆ requires generating full trajectory completions from ρBL for each of k candidate tokens at each decoding step, which is substantially more expensive than standard decoding. This compute overhead is never discussed or compared against baselines." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "The paper uses reward model scores and GPT-4 ratings as primary metrics without discussing whether these metrics actually measure alignment quality. No discussion of construct validity for any benchmark or metric used." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No scaffolding is involved. TQ⋆ is a token-level decoding modification, not an agentic system." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of temporal leakage. The evaluation datasets (UltraFeedback 2023, HH-RLHF 2022) may have been available before or during the training of models like Mistral-7B and Zephyr-7B, potentially affecting generation quality." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of whether the evaluation setup leaks information. For example, whether the DPO-aligned baseline model (which serves as ρBL) may have been fine-tuned on the same evaluation prompts is not addressed." 
356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of whether training and evaluation data are independent. The DPO models were trained on subsets of the same datasets used for evaluation, and this potential overlap is not examined." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No concrete leakage detection or prevention methods are used — no canary strings, membership inference, or decontamination procedures." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "TQ⋆ consistently outperforms SoTA baselines (decoding-time CD and ARGS, and fine-tuning-based DPO) in average reward across all 6 evaluation setups.", 372 "evidence": "Figures 2(a,c,d) and Figure 6(a,b,c) show normalized average rewards across all setups in Table 1. TQ⋆ achieves the highest bar in every comparison.", 373 "supported": "moderate" 374 }, 375 { 376 "claim": "TQ⋆ achieves up to 1.45x improvement in average reward and 67.34% GPT-4 based win-tie rate over CD.", 377 "evidence": "Section 1 and Table 2 report GPT-4 win-tie rates of 66.67%, 65.34%, and 67.34% over CD across the three evaluation setups. The 1.45x figure is stated but the specific evaluation setup producing it is not clearly identified.", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "TQ⋆ produces responses with the highest coherence and diversity compared to all baselines.", 382 "evidence": "Figure 3 shows diversity and coherence analysis on the Berkeley Nectar dataset. TQ⋆ achieves the highest values in both metrics.", 383 "supported": "weak" 384 }, 385 { 386 "claim": "The suboptimality gap is upper bounded by β·D_KL(ρ*, ρ_sft) − α·h_α(x), providing 'double robustness.'", 387 "evidence": "Theorem 1, statement 1 (Section 3.3) with full proof in Appendix E.1. 
The bound is mathematically derived under the stated assumptions.", 388 "supported": "strong" 389 }, 390 { 391 "claim": "Indirect transfer decoding outperforms direct transfer when source and target rewards differ significantly.", 392 "evidence": "Figures 4 and 5 show that the direct transfer variant (DT) has 'subpar performance' compared to TQ⋆ in both synthetic and real indirect transfer settings. Section 4.2 explicitly notes this.", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "TQ⋆ is effective even when there are substantial discrepancies between target and baseline rewards.", 397 "evidence": "Figures 5(c,d) show that even with significant distribution shift between source and target reward values (HH-RLHF setup), TQ⋆ still outperforms all competitive decoding approaches.", 398 "supported": "moderate" 399 } 400 ], 401 "red_flags": [ 402 { 403 "flag": "Approximate baseline implementation", 404 "detail": "Footnote 1 acknowledges that CD [33] is compared using 'an approximate version' because the codebase was unavailable, specifically omitting the trained adapter module. This could systematically disadvantage the primary SoTA baseline, inflating TQ⋆'s relative performance." 405 }, 406 { 407 "flag": "No error bars or variance reporting", 408 "detail": "All results across all figures and tables are single point estimates with no uncertainty quantification. Without multiple runs, it is impossible to assess whether the observed differences are meaningful or within noise." 409 }, 410 { 411 "flag": "No limitations section", 412 "detail": "The paper has no limitations, threats-to-validity, or scope-boundary discussion. The claims are presented as broadly applicable without acknowledging restrictions to 7B models, specific reward families, or particular datasets." 
413 }, 414 { 415 "flag": "GPT-4 as human evaluation surrogate without validation", 416 "detail": "GPT-4 is used as a 'surrogate for human assessment' (Section 4.1) without any validation that GPT-4 ratings correlate with actual human preferences in this setting, and without specifying the GPT-4 version used." 417 }, 418 { 419 "flag": "Compute cost of TQ⋆ not discussed", 420 "detail": "TQ⋆ requires generating full trajectory completions from ρBL for each of k=10 candidate tokens at every decoding step. This is potentially orders of magnitude more expensive than standard or CD decoding, yet no latency or cost comparison is provided." 421 }, 422 { 423 "flag": "Only successes shown in examples", 424 "detail": "Appendix I shows 6 generated text examples where TQ⋆ always produces the best response. No examples of failure cases or situations where baselines outperform TQ⋆ are shown, suggesting selective presentation." 425 } 426 ], 427 "cited_papers": [ 428 { 429 "title": "Direct preference optimization: Your language model is secretly a reward model", 430 "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Stefano Ermon", "Christopher D. Manning", "Chelsea Finn"], 431 "year": 2023, 432 "relevance": "Core alignment method (DPO) used as both a baseline and a building block for TQ⋆'s baseline models." 433 }, 434 { 435 "title": "Controlled decoding from language models", 436 "authors": ["Sidharth Mudgal", "Jong Lee", "Harish Ganapathy"], 437 "year": 2024, 438 "relevance": "Primary SoTA baseline for decoding-time alignment that TQ⋆ aims to improve upon." 439 }, 440 { 441 "title": "ARGS: Alignment as reward-guided search", 442 "authors": ["Maxim Khanov", "Jirayu Burapacheep", "Yixuan Li"], 443 "year": 2024, 444 "relevance": "Decoding-time alignment baseline that adjusts generation probabilities based on reward model feedback." 
445 }, 446 { 447 "title": "Training language models to follow instructions with human feedback", 448 "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"], 449 "year": 2022, 450 "relevance": "Foundational RLHF work (InstructGPT) establishing the alignment framework that TQ⋆ builds upon." 451 }, 452 { 453 "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback", 454 "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"], 455 "year": 2022, 456 "arxiv_id": "2204.05862", 457 "relevance": "HH-RLHF dataset and methodology used as one of the three evaluation benchmarks." 458 }, 459 { 460 "title": "Deep reinforcement learning from human preferences", 461 "authors": ["Paul F Christiano", "Jan Leike", "Tom Brown"], 462 "year": 2017, 463 "relevance": "Foundational work on learning reward models from human preferences, the basis of RLHF pipelines." 464 }, 465 { 466 "title": "Learning to summarize with human feedback", 467 "authors": ["Nisan Stiennon", "Long Ouyang", "Jeffrey Wu"], 468 "year": 2020, 469 "relevance": "Extended RLHF to language model training for summarization, demonstrating effectiveness over supervised fine-tuning." 470 }, 471 { 472 "title": "DEAL: Decoding-time alignment for large language models", 473 "authors": ["James Y Huang", "Sailik Sengupta", "Daniele Bonadiman"], 474 "year": 2024, 475 "arxiv_id": "2402.06147", 476 "relevance": "Related decoding-time alignment approach that re-conceptualizes text generation as a search problem." 477 }, 478 { 479 "title": "Decoding-time realignment of language models", 480 "authors": ["Tianlin Liu", "Shangmin Guo", "Leonardo Bianco"], 481 "year": 2024, 482 "arxiv_id": "2402.02992", 483 "relevance": "Proposes multiplicative reweighting of generation probabilities using importance ratios from aligned and reference models." 
484 }, 485 { 486 "title": "UltraFeedback: Boosting language models with high-quality feedback", 487 "authors": ["Ganqu Cui", "Lifan Yuan", "Ning Ding"], 488 "year": 2023, 489 "arxiv_id": "2310.01377", 490 "relevance": "Primary evaluation dataset used in 4 of 6 evaluation setups for testing alignment via decoding." 491 }, 492 { 493 "title": "Self-play fine-tuning converts weak language models to strong language models", 494 "authors": ["Zixiang Chen", "Yihe Deng", "Huizhuo Yuan"], 495 "year": 2024, 496 "arxiv_id": "2401.01335", 497 "relevance": "Alternative alignment approach using self-play, relevant to the broader landscape of LLM alignment methods." 498 }, 499 { 500 "title": "Scaling laws for reward model overoptimization", 501 "authors": ["Leo Gao", "John Schulman", "Jacob Hilton"], 502 "year": 2023, 503 "relevance": "Analyzes reward-KL tradeoff curves used as a performance framework in TQ⋆'s theoretical analysis." 504 } 505 ], 506 "engagement_factors": { 507 "practical_relevance": { 508 "score": 2, 509 "justification": "The decoding strategy is implementable by practitioners with access to aligned baseline models and target reward models, but no code is released." 510 }, 511 "surprise_contrarian": { 512 "score": 1, 513 "justification": "The idea of leveraging existing aligned models for better Q-function estimation is a reasonable extension, not a paradigm challenge." 514 }, 515 "fear_safety": { 516 "score": 1, 517 "justification": "The paper is about improving alignment, which relates to AI safety, but does not expose new risks or attack vectors." 518 }, 519 "drama_conflict": { 520 "score": 0, 521 "justification": "No controversy or conflict angle — a straightforward technical contribution." 522 }, 523 "demo_ability": { 524 "score": 0, 525 "justification": "No code, demo, or pip-installable tool is provided." 
526 }, 527 "brand_recognition": { 528 "score": 1, 529 "justification": "The authors are from University of Maryland and Princeton, which are respectable academic institutions but not marquee AI labs." 530 } 531 } 532 }