ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30195B)


      1 {
      2   "paper": {
      3     "title": "Getting More Juice Out of the SFT Data: Reward Learning from Human Demonstration Improves SFT for LLM Alignment",
      4     "authors": [
      5       "Jiaxiang Li",
      6       "Siliang Zeng",
      7       "Hoi-To Wai",
      8       "Chenliang Li",
      9       "Alfredo Garcia",
     10       "Mingyi Hong"
     11     ],
     12     "year": 2024,
     13     "venue": "Neural Information Processing Systems",
     14     "arxiv_id": "2405.17888",
     15     "doi": "10.48550/arXiv.2405.17888"
     16   },
     17   "scan_version": 3,
     18   "active_modules": [
     19     "experimental_rigor",
     20     "data_leakage"
     21   ],
     22   "methodology_tags": [
     23     "benchmark-eval",
     24     "theoretical"
     25   ],
     26   "key_findings": "The paper proposes two algorithms (RFT and IRFT) that learn reward models from demonstration data during SFT via inverse reinforcement learning. IRFT improves zephyr-7b-sft-full from 59.48% to 61.03% on the HuggingFace Open LLM Leaderboard. The implicit reward learned from demonstration data alone can distinguish preferred from non-preferred continuations without seeing preference data. IRFT generalizes SPIN as a special case (T=1) and is shown to converge to stationary points with O(1/√TK) rate.",
     27   "checklist": {
     28     "artifacts": {
     29       "code_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Code is released at https://github.com/JasonJiaxiangLi/Reward_learning_SFT, stated in the abstract and confirmed in the NeurIPS checklist as released upon acceptance."
     33       },
     34       "data_released": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "All datasets used are publicly available: Anthropic-HH (Bai et al., 2022) and Ultrachat200k from HuggingFace. No proprietary data was collected."
     38       },
     39       "environment_specified": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Appendix C specifies: DeepSpeed ZeRO-3, FlashAttention-2, RMSProp optimizer, NVIDIA A100-40G GPUs (2 for 1B, 8 for 7B), bfloat16 precision, TRL package for PPO, and Language Model Evaluation Harness v0.4.2."
     43       },
     44       "reproduction_instructions": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "While code is released and Appendix C provides hyperparameters, the paper does not contain explicit step-by-step reproduction instructions (e.g., specific commands to run). The paper says 'We follow the code as in SPIN' but does not provide a self-contained reproduction guide."
     48       }
     49     },
     50     "statistical_methodology": {
     51       "confidence_intervals_or_error_bars": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Figure 2 shows error bands for the reward/win-rate plots, but the main benchmark results in Tables 3, 4, and 6 report only point estimates with no confidence intervals or error bars."
     55       },
     56       "significance_tests": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No statistical significance tests are reported. Claims like 'the proposed methods show significant performance improvement' are based on raw number comparisons (e.g., 59.48% vs 61.03%) without any p-values or tests."
     60       },
     61       "effect_sizes_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Performance is reported with baseline context across all experiments. Tables 3 and 4 show absolute scores for each method and task, allowing computation of improvement magnitudes (e.g., 59.48% → 61.03% = 1.55pp improvement)."
     65       },
     66       "sample_size_justified": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No justification for the choice of 10k data from Anthropic-HH or 50k from Ultrachat200k. No power analysis. The 50k selection follows SPIN's convention but is not independently justified."
     70       },
     71       "variance_reported": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "Figure 2 shows variance bands for reward/win-rate curves, but the main benchmark results in Tables 3, 4, and 6 report single-run numbers with no standard deviations or variance across runs."
     75       }
     76     },
     77     "evaluation_design": {
     78       "baselines_included": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Baselines include the pretrained model, standard SFT, and SPIN (Chen et al., 2024) across all experiments in Tables 3, 4, and 6."
     82       },
     83       "baselines_contemporary": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "SPIN (Chen et al., 2024) is a contemporary baseline from the same year. DPO is discussed conceptually. The baselines represent recent approaches to SFT improvement."
     87       },
     88       "ablation_study": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Tables 3, 4, and 6 systematically vary the outer loop count T and inner loop count K, effectively ablating the key hyperparameters of the proposed algorithms. Both RFT and IRFT variants are tested."
     92       },
     93       "multiple_metrics": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Six metrics from the Open LLM Leaderboard are used: AI2_Arc, TruthfulQA, Winogrande, GSM8k, HellaSwag, and MMLU. Additionally, reward scores and win rates are reported for Anthropic-HH experiments."
     97       },
     98       "human_evaluation": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "All evaluation is automated via the Open LLM Leaderboard benchmarks and a reward model (PKU-Alignment/beaver-7b-v3.0-reward). No human evaluation of model outputs was conducted, though Table 5 shows qualitative generation examples without human rating."
    102       },
    103       "held_out_test_set": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Open LLM Leaderboard uses standard test splits. For Anthropic-HH, they evaluate on a test dataset separate from the 10k training set (Figure 2 caption: 'average score of test dataset')."
    107       },
    108       "per_category_breakdown": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Tables 3, 4, and 6 provide per-task breakdowns across all six Open LLM Leaderboard tasks, not just averages."
    112       },
    113       "failure_cases_discussed": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "No dedicated failure analysis. The paper notes '1b model is not strong enough to handle hard tasks, e.g. GSM8k' but does not analyze where or why the proposed methods fail. Table 5 shows only successful generation examples."
    117       },
    118       "negative_results_reported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Table 4 shows IRFT T=8 (59.85%) and T=16 (60.25%) underperforming other IRFT configurations. The paper notes 'more frequent generation might also result in more variances' and acknowledges that hyperparameter tuning is needed."
    122       }
    123     },
    124     "claims_and_evidence": {
    125       "abstract_claims_supported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The abstract claims 'significant performance improvement over existing SFT approaches' and cites 59.47%→61.03%. Table 4 confirms: base zephyr-7b-sft-full at 59.48% and IRFT T=10 at 61.03%. Convergence guarantees are proven in Theorem 3.1."
    129       },
    130       "causal_claims_justified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Causal claims ('reward learning improves SFT') are supported by controlled experiments varying only the training method while holding model, data, and hyperparameters fixed. Ablation over T and K values provides additional causal evidence through single-variable manipulation."
    134       },
    135       "generalization_bounded": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The title claims 'Reward Learning from Human Demonstration Improves SFT for LLM Alignment' broadly, but experiments are limited to pythia-1.4b and zephyr-7b-sft-full on two datasets. The conclusion acknowledges 'future works include exploring reward-learning for larger models' but the broad framing is not adequately bounded."
    139       },
    140       "alternative_explanations_discussed": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No discussion of alternative explanations for the observed improvements. For example, the improvement could be due to the additional training compute (generation + extra training steps) rather than reward learning per se. The SPIN comparison partially addresses this but not explicitly."
    144       },
    145       "proxy_outcome_distinction": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The paper measures Open LLM Leaderboard scores (commonsense reasoning, math, knowledge) and frames this as 'alignment' improvement. The gap between benchmark performance and actual alignment quality is not acknowledged — benchmark scores are treated as direct measures of alignment."
    149       }
    150     },
    151     "setup_transparency": {
    152       "model_versions_specified": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Specific model names are provided: pythia-1b, pythia-1.4b, zephyr-7b-sft-full, PKU-Alignment/beaver-7b-v3.0-reward. These are HuggingFace model identifiers that pinpoint exact model versions."
    156       },
    157       "prompts_provided": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Appendix C states: 'We use the same prompt template \"### Instruction: prompt\\n\\n### Response:\" as in Chen et al. [2024].' The template is fully specified and input content comes from public datasets."
    161       },
    162       "hyperparameters_reported": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Appendix C reports: learning rate 5e-7 (first 2 epochs) / 1e-7 (next 2), β=0.1, max sequence length 1024 (1B) / 2048 (7B), per-device batch sizes of 4/8/2, RMSProp optimizer, bfloat16 precision, 2 epochs per generation."
    166       },
    167       "scaffolding_described": {
    168         "applies": false,
    169         "answer": false,
    170         "justification": "No agentic scaffolding is used. The methods are training algorithms (SFT, reward learning, policy optimization) without any agentic workflow."
    171       },
    172       "data_preprocessing_documented": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 5.1 describes: for Anthropic-HH, only chosen/preferred continuations are kept; top 10k data selected by beaver-7b-v3.0-reward scores. For Ultrachat200k, same 50k selection strategy as SPIN. Training for 2 epochs per generation round."
    176       }
    177     },
    178     "limitations_and_scope": {
    179       "limitations_section_present": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 6 'Conclusions and Limitations' discusses: convergence only to stationary points (not global optima), unclear what the policy converges to, and non-negligible additional computation costs."
    183       },
    184       "threats_to_validity_specific": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The limitations discuss theoretical scope (stationary point convergence) and computational cost, but do not address specific threats to validity of the empirical results, such as whether improvements are within noise margins, confounding from additional compute, or model-specific effects."
    188       },
    189       "scope_boundaries_stated": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper mentions 'future works include exploring reward-learning for larger models and more complicated demonstration tasks' but does not explicitly state what the current results do NOT show, such as which settings or model scales the findings should not be generalized to."
    193       }
    194     },
    195     "data_integrity": {
    196       "raw_data_available": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "All experimental data comes from publicly available datasets: Anthropic-HH and Ultrachat200k from HuggingFace. Code for reproducing experiments is released on GitHub."
    200       },
    201       "data_collection_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Section 5.1 describes data collection: Anthropic-HH provides preference data from which only chosen continuations are used; 10k selected by reward scores. Ultrachat200k is a subset of UltraChat with 50k selected for training."
    205       },
    206       "recruitment_methods_described": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "No human participants. All data comes from standard public benchmarks (Anthropic-HH, Ultrachat200k, Open LLM Leaderboard tasks)."
    210       },
    211       "data_pipeline_documented": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The pipeline is documented: raw datasets → selection of chosen/preferred continuations → filtering by reward score (for Anthropic-HH) or following SPIN's 50k strategy → training with 2 epochs per generation → evaluation on test splits."
    215       }
    216     },
    217     "conflicts_of_interest": {
    218       "funding_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Acknowledgements section: 'M. Hong, S. Zeng and J. Li are supported partially by NSF under the grants EPCN-2311007, ECCS-2426064 and CCF-1910385, also by Minnesota Supercomputing Institute. A. Garcia and C. Li are partially supported by ECCS-2240789.'"
    222       },
    223       "affiliations_disclosed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "All six authors list university affiliations: University of Minnesota, Chinese University of Hong Kong, and Texas A&M University. No evaluated product is affiliated with these institutions."
    227       },
    228       "funder_independent_of_outcome": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Funding comes from NSF grants and Minnesota Supercomputing Institute. NSF is an independent federal agency with no financial interest in the outcome of LLM alignment research."
    232       },
    233       "financial_interests_declared": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No competing interests or financial interests statement is included in the paper."
    237       }
    238     },
    239     "contamination": {
    240       "training_cutoff_stated": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "The training data cutoff for pythia-1.4b and zephyr-7b-sft-full is not stated. These models are evaluated on Open LLM Leaderboard benchmarks that may have been in their pre-training data."
    244       },
    245       "train_test_overlap_discussed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No discussion of whether Open LLM Leaderboard benchmark data (Arc, HellaSwag, MMLU, etc.) appeared in the pre-training data of pythia or zephyr models."
    249       },
    250       "benchmark_contamination_addressed": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "The paper uses benchmarks (Arc 2018, HellaSwag 2019, MMLU 2020, Winogrande 2021) that were all published before the models' training data was collected, creating contamination risk. This is not discussed."
    254       }
    255     },
    256     "human_studies": {
    257       "pre_registered": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study. All experiments involve model training and automated benchmark evaluation."
    261       },
    262       "irb_or_ethics_approval": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants. Study involves only LLM training algorithms and automated evaluation."
    266       },
    267       "demographics_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "inclusion_exclusion_criteria": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "randomization_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       },
    282       "blinding_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants."
    286       },
    287       "attrition_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human participants."
    291       }
    292     },
    293     "cost_and_practicality": {
    294       "inference_cost_reported": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "Table 2 provides qualitative cost comparisons (e.g., 'SFT+Generation' time) but no actual wall-clock times, GPU hours, or dollar costs are reported for any experiment."
    298       },
    299       "compute_budget_stated": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "Hardware is specified (2× or 8× NVIDIA A100-40G) but total GPU hours, training time, or total compute budget is not stated for any experiment."
    303       }
    304     },
    305     "experimental_rigor": {
    306       "seed_sensitivity_reported": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No mention of multiple random seeds. Tables 3, 4, and 6 appear to report single-run results without seed variation analysis."
    310       },
    311       "number_of_runs_stated": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The number of experimental runs is not explicitly stated. Results appear to be from single runs."
    315       },
    316       "hyperparameter_search_budget": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "Different T and K values are tested (Tables 3, 4, 6) but no formal hyperparameter search budget is reported. The search space and total configurations tried are not quantified."
    320       },
    321       "best_config_selection_justified": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "Tables 3, 4, and 6 report all tested configurations (various T and K combinations) rather than cherry-picking the best result. The paper recommends T=5 as a default based on the observed pattern across experiments."
    325       },
    326       "multiple_comparison_correction": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "Multiple comparisons are made across methods and tasks without any significance tests, let alone multiple comparison corrections."
    330       },
    331       "self_comparison_bias_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The authors implement their own baselines (SFT, SPIN) without acknowledging potential author-implementation bias. No independent evaluation or discussion of this bias."
    335       },
    336       "compute_budget_vs_performance": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "Table 2 provides qualitative cost categories but performance is not plotted as a function of compute. IRFT uses more compute than SFT (generation overhead) but this is not controlled for in comparisons."
    340       },
    341       "benchmark_construct_validity": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "The paper uses Open LLM Leaderboard benchmarks to measure 'alignment' quality without discussing whether these benchmarks actually measure alignment. Commonsense reasoning and math tasks are proxies for alignment at best."
    345       },
    346       "scaffold_confound_addressed": {
    347         "applies": false,
    348         "answer": false,
    349         "justification": "No scaffolding is involved. The methods are training algorithms evaluated directly through standard benchmark pipelines."
    350       }
    351     },
    352     "data_leakage": {
    353       "temporal_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the benchmark tasks (Arc 2018, MMLU 2020, etc.) existed before the models' pre-training data was collected."
    357       },
    358       "feature_leakage_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether the evaluation setup provides information that would not be available in real deployment."
    362       },
    363       "non_independence_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No discussion of independence between training and evaluation data. The SFT data (Anthropic-HH, Ultrachat) and evaluation benchmarks may share distributional overlap."
    367       },
    368       "leakage_detection_method": {
    369         "applies": true,
    370         "answer": false,
    371         "justification": "No leakage detection or prevention methods are applied (no canary strings, membership inference, decontamination, or temporal splits)."
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "Reward learning from demonstration data significantly improves SFT for LLM alignment, with IRFT improving zephyr-7b-sft-full from 59.48% to 61.03% average on the Open LLM Leaderboard.",
    378       "evidence": "Table 4 shows zephyr-7b-sft-full base at 59.48% average, IRFT T=10 at 61.03%. Multiple IRFT configurations outperform both SFT and the base model.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "IRL-based methods can distinguish preferred from non-preferred continuations even when trained only on preferred/chosen demonstration data.",
    383       "evidence": "Figure 1 (right) shows log probability gap between chosen and rejected continuations. RFT and IRFT produce positive gaps while SFT assigns higher probability to non-preferred data. Example 2 in Section 3.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "The proposed algorithms (RFT and IRFT) converge to stationary solutions of the IRL problem at rate O(1/√TK).",
    388       "evidence": "Theorem 3.1 with formal proof in Appendix B under Assumption B.1 (bounded policy, bounded gradients, Lipschitz gradients).",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "IRFT generalizes SPIN as a special case: IRFT with T=1 is equivalent to SPIN.",
    393       "evidence": "Section 4 shows mathematically that when T=1 and K equals total training iterations, IRFT reduces to SPIN. Confirmed empirically in Tables 3 and 4 where IRFT T=1 rows match SPIN.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "More frequent generation (T>1) in IRFT outperforms the single-generation SPIN approach, with T≈5-10 being optimal.",
    398       "evidence": "Tables 3 and 4 show IRFT with various T values. For 7B: SPIN (T=1) 60.32%, IRFT T=5 at 60.71%, T=10 at 61.03%. But T=8 at 59.85% is worse, suggesting sensitivity.",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "The reward model learned from only demonstration data (via RFT) already possesses strong capability in distinguishing chosen and rejected responses.",
    403       "evidence": "Figure 1 (right) and Figure 2 showing win rate of RFT over SFT models reaching ~0.8. The reward is learned without any preference data.",
    404       "supported": "moderate"
    405     }
    406   ],
    407   "red_flags": [
    408     {
    409       "flag": "No error bars on main benchmark results",
    410       "detail": "Tables 3, 4, and 6 report single-run point estimates. Differences as small as 0.08pp (e.g., 37.27 vs 37.35 in Table 3) are presented as meaningful improvements without any uncertainty quantification. These could easily be within noise."
    411     },
    412     {
    413       "flag": "No significance tests for claimed improvements",
    414       "detail": "The paper claims 'significant performance improvement' based solely on comparing raw numbers. No statistical tests are used to establish that 59.48%→61.03% is statistically significant rather than random variation."
    415     },
    416     {
    417       "flag": "Inconsistent hyperparameter sensitivity",
    418       "detail": "IRFT T=8 on 7B model (59.85%) actually underperforms IRFT T=1/SPIN (60.32%), while T=10 is the best (61.03%). This non-monotonic pattern and sensitivity to T suggests results may be fragile and the 'optimal' T may be coincidental."
    419     },
    420     {
    421       "flag": "Compute confound not controlled",
    422       "detail": "IRFT with T>1 involves more generation passes than SFT or SPIN, meaning more total compute. The improvement could be partially or fully explained by additional training compute rather than the reward learning mechanism itself. Table 2 acknowledges the cost difference but does not control for it experimentally."
    423     },
    424     {
    425       "flag": "Benchmark-alignment proxy gap unacknowledged",
    426       "detail": "Open LLM Leaderboard tasks (commonsense reasoning, math, knowledge) are used as proxies for 'alignment quality' without discussion of whether these benchmarks actually measure the alignment improvements claimed."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Deep reinforcement learning from human preferences",
    432       "authors": ["Paul F Christiano", "Jan Leike", "Tom Brown", "Miljan Martic", "Shane Legg", "Dario Amodei"],
    433       "year": 2017,
    434       "relevance": "Foundational RLHF paper that established the reward-learning-then-RL pipeline for aligning models with human preferences."
    435     },
    436     {
    437       "title": "Direct preference optimization: Your language model is secretly a reward model",
    438       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Christopher D Manning", "Stefano Ermon", "Chelsea Finn"],
    439       "year": 2024,
    440       "relevance": "DPO is a key baseline approach for alignment that bypasses explicit reward modeling; this paper builds on DPO's implicit reward identity."
    441     },
    442     {
    443       "title": "Self-play fine-tuning converts weak language models to strong language models",
    444       "authors": ["Zixiang Chen", "Yihe Deng", "Huizhuo Yuan", "Kaixuan Ji", "Quanquan Gu"],
    445       "year": 2024,
    446       "arxiv_id": "2401.01335",
    447       "relevance": "SPIN is a primary baseline and the paper shows IRFT generalizes SPIN; both use demonstration data with self-generated synthetic negatives."
    448     },
    449     {
    450       "title": "Training language models to follow instructions with human feedback",
    451       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    452       "year": 2022,
    453       "relevance": "InstructGPT paper that established the SFT→RLHF pipeline used in ChatGPT, which this work seeks to improve at the SFT stage."
    454     },
    455     {
    456       "title": "Zephyr: Direct distillation of lm alignment",
    457       "authors": ["Lewis Tunstall", "Edward Beeching", "Nathan Lambert"],
    458       "year": 2023,
    459       "arxiv_id": "2310.16944",
    460       "relevance": "Provides the zephyr-7b-sft-full base model used in experiments and represents the DPO alignment approach."
    461     },
    462     {
    463       "title": "Is DPO superior to PPO for LLM alignment? A comprehensive study",
    464       "authors": ["Shusheng Xu", "Wei Fu", "Jiaxuan Gao"],
    465       "year": 2024,
    466       "arxiv_id": "2404.10719",
    467       "relevance": "Comprehensive comparison of reward-based vs reward-free LLM alignment methods, directly relevant to the reward learning debate."
    468     },
    469     {
    470       "title": "Proximal policy optimization algorithms",
    471       "authors": ["John Schulman", "Filip Wolski", "Prafulla Dhariwal", "Alec Radford", "Oleg Klimov"],
    472       "year": 2017,
    473       "arxiv_id": "1707.06347",
    474       "relevance": "PPO is used for the policy optimization step in Algorithm 1 (RFT) and is the standard RL algorithm in RLHF pipelines."
    475     },
    476     {
    477       "title": "Self-rewarding language models",
    478       "authors": ["Weizhe Yuan", "Richard Yuanzhe Pang", "Kyunghyun Cho"],
    479       "year": 2024,
    480       "arxiv_id": "2401.10020",
    481       "relevance": "Related approach using self-generated rewards for LLM improvement, part of the broader trend of reward learning without explicit preference data."
    482     },
    483     {
    484       "title": "Safe RLHF: Safe reinforcement learning from human feedback",
    485       "authors": ["Josef Dai", "Xuehai Pan", "Ruiyang Sun", "Jiaming Ji"],
    486       "year": 2024,
    487       "relevance": "Provides the beaver-7b-v3.0-reward model used as the ground-truth reward evaluator in the Anthropic-HH experiments."
    488     },
    489     {
    490       "title": "Back to basics: Revisiting reinforce style optimization for learning from human feedback in LLMs",
    491       "authors": ["Arash Ahmadian", "Chris Cremer", "Matthias Gallé"],
    492       "year": 2024,
    493       "arxiv_id": "2402.14740",
    494       "relevance": "Revisits REINFORCE-based policy optimization for RLHF, part of the ongoing exploration of RL methods for LLM alignment."
    495     },
    496     {
    497       "title": "Weak-to-strong generalization: Eliciting strong capabilities with weak supervision",
    498       "authors": ["Collin Burns", "Pavel Izmailov", "Jan Hendrik Kirchner"],
    499       "year": 2023,
    500       "arxiv_id": "2312.09390",
    501       "relevance": "Explores using weaker supervision signals to improve stronger models, related to the question of extracting more signal from limited demonstration data."
    502     },
    503     {
    504       "title": "Maximum-likelihood inverse reinforcement learning with finite-time guarantees",
    505       "authors": ["Siliang Zeng", "Chenliang Li", "Alfredo Garcia", "Mingyi Hong"],
    506       "year": 2022,
    507       "relevance": "Foundational IRL work by four of the present authors that provides the ML-IRL framework used in this paper's formulation."
    508     }
    509   ],
    510   "engagement_factors": {
    511     "practical_relevance": {
    512       "score": 2,
    513       "justification": "Practitioners doing SFT alignment could apply IRFT as a drop-in improvement; code is released and the method uses standard training infrastructure."
    514     },
    515     "surprise_contrarian": {
    516       "score": 1,
    517       "justification": "Challenges the assumption that reward learning requires preference data, but IRL from demonstrations is well-established in the RL literature."
    518     },
    519     "fear_safety": {
    520       "score": 0,
    521       "justification": "No safety or security concerns raised; the work is about improving SFT training efficiency."
    522     },
    523     "drama_conflict": {
    524       "score": 0,
    525       "justification": "No controversy; the paper takes a constructive approach building on existing methods (SPIN, DPO, RLHF)."
    526     },
    527     "demo_ability": {
    528       "score": 2,
    529       "justification": "Code released on GitHub with implementation following SPIN's codebase, making it runnable by researchers with appropriate GPU resources."
    530     },
    531     "brand_recognition": {
    532       "score": 0,
    533       "justification": "University research groups (Minnesota, CUHK, Texas A&M) without major brand recognition in the LLM space."
    534     }
    535   }
    536 }

Impressum · Datenschutz