ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (16712B)


      1 {
      2   "paper": {
      3     "title": "A Comprehensive Survey of LLM Alignment Techniques: RLHF, RLAIF, PPO, DPO and More",
      4     "authors": ["Zhichao Wang", "Bin Bi", "Shiva Kumar Pentyala", "Kiran Ramnath", "Sougata Chaudhuri", "Shubham Mehrotra", "Zixu (James) Zhu", "Xiang-Bo Mao", "Sitaram Asur", "Na (Claire) Cheng"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2407.16216"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No code repository or analysis scripts are mentioned or linked in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset of surveyed papers, extracted features, or analysis data is released."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a survey paper with no computational experiments requiring an environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No instructions are provided for reproducing the survey methodology or paper selection process."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "Survey paper with no original experiments or statistical aggregation."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No original experiments are conducted; the paper summarizes existing work."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No original experiments or meta-analysis requiring effect size reporting."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No original experiments conducted."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No original experiments conducted."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The survey does not compare itself against prior surveys on LLM alignment or discuss how it improves upon them."
     64       },
     65       "baselines_contemporary": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No baselines are included, so contemporaneity is not applicable."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "Survey paper with no system components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "Survey paper with no experiments requiring metrics."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No system outputs to evaluate; this is a literature review."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Survey paper with no experiments."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 1 provides a detailed breakdown of all surveyed papers across 13 categorical dimensions (reward model type, feedback type, RL policy type, optimization type, etc.)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses limitations of individual methods (e.g., DPO's sensitivity to distribution shifts in Section 3.3.3, DPO's reward degradation issue in Section 3.3.4, overfitting in RLHF/DPO in Section 3.3.6)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports negative findings from surveyed work, such as DPO's failure modes with small edit distances (Section 3.3.4), alignment tax on smaller models (Section 3.1.2), and low inter-annotator agreement for Anthropic crowdworkers (~63%, Section 3.1.2)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims to provide a comprehensive categorization of alignment methods, which the paper delivers through its 13-dimensional taxonomy in Table 1 and detailed individual paper reviews in Section 3."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper makes no causal claims of its own; it reports causal findings from surveyed papers."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper does not explicitly state scope boundaries or limitations of its survey coverage. No discussion of what alignment methods or perspectives are excluded."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": false,
    124         "answer": false,
    125         "justification": "As a pure taxonomy/survey with no empirical results of its own, alternative explanations are not applicable."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "Survey paper that does not use any models."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting is used in this survey."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No experiments are conducted."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper does not describe how papers were selected for inclusion. No search queries, databases, inclusion/exclusion criteria, or filtering pipeline is documented. Papers appear to be selected without a documented systematic methodology."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No limitations or threats-to-validity section is present in the paper."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what is out of scope. No discussion of which alignment approaches, time periods, or publication venues are excluded."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data (list of surveyed papers, search results, extraction data) is made available."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The paper does not describe how the surveyed papers were identified or collected. No search methodology is documented."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved; the survey's data source is published research papers, so no recruitment took place."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No documentation of the pipeline from paper discovery to final inclusion in the survey."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding statement or acknowledgments section is present. All authors are from Salesforce, but there is no explicit funding disclosure."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are listed with Salesforce affiliation on the first page."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Salesforce is a major AI company with products that use LLM alignment techniques. No disclosure of whether Salesforce has a stake in how alignment methods are characterized."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "Survey paper that does not evaluate any pre-trained model on a benchmark."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Survey paper that does not evaluate any pre-trained model on a benchmark."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Survey paper that does not evaluate any pre-trained model on a benchmark."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this survey paper."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "Survey paper with no method of its own to cost."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "Survey paper with no computational experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "This is the first comprehensive survey categorizing LLM alignment methods across reward model, feedback, RL, and optimization dimensions.",
    286       "evidence": "Abstract and Section 1 state 'there has not been a comprehensive survey paper that categorizes and details these approaches.'",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "The paper categorizes alignment methods into 13 evaluation metrics across 4 main topics.",
    291       "evidence": "Table 1 presents all surveyed papers classified across 13 dimensions. Section 2 describes the categorical outline.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "DPO is sensitive to distribution shifts between base model outputs and preference data.",
    296       "evidence": "Section 3.3.3 cites reference [53] for this finding. The paper does not provide its own evidence.",
    297       "supported": "moderate"
    298     }
    299   ],
    300   "methodology_tags": ["meta-analysis"],
    301   "key_findings": "This survey categorizes approximately 30 LLM alignment methods across 13 dimensions organized into four main topics: reward models (explicit vs. implicit, pointwise vs. preference, response-level vs. token-level, negative preference), feedback (preference vs. binary, pairwise vs. listwise, human vs. AI), reinforcement learning (reference-based vs. reference-free, length control, divergence types, on-policy vs. off-policy), and optimization (online vs. offline, separated vs. merged SFT). The paper provides detailed mathematical derivations and individual reviews for each method, serving as a reference for the alignment research landscape as of mid-2024.",
    302   "red_flags": [
    303     {
    304       "flag": "No systematic review methodology",
    305       "detail": "The paper does not describe how papers were selected for inclusion. No search queries, databases, inclusion/exclusion criteria, or PRISMA-style flow diagram is provided. The selection appears ad hoc, making it impossible to assess completeness or bias in coverage."
    306     },
    307     {
    308       "flag": "No quality assessment of surveyed papers",
    309       "detail": "The survey summarizes each paper's claims and methods without assessing the quality of evidence or methodology. It uncritically reports claims from surveyed papers without noting methodological weaknesses, potentially lending weakly supported results the same credibility as well-supported ones."
    310     },
    311     {
    312       "flag": "No limitations section",
    313       "detail": "The paper has no limitations, threats to validity, or scope boundaries section despite being a comprehensive survey claiming to fill a gap in the literature."
    314     },
    315     {
    316       "flag": "Corporate affiliation without conflict disclosure",
    317       "detail": "All authors are from Salesforce, a company with commercial LLM products that use alignment techniques. No conflict of interest statement is provided."
    318     }
    319   ],
    320   "cited_papers": [
    321     {
    322       "title": "Training language models to follow instructions with human feedback",
    323       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    324       "year": 2022,
    325       "relevance": "Foundational RLHF/InstructGPT paper establishing human preference alignment methodology for LLMs."
    326     },
    327     {
    328       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    329       "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"],
    330       "year": 2022,
    331       "relevance": "Anthropic's RLHF work on helpfulness and harmlessness trade-offs in alignment."
    332     },
    333     {
    334       "title": "Direct preference optimization: Your language model is secretly a reward model",
    335       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    336       "year": 2023,
    337       "relevance": "Introduced DPO as a simpler alternative to RLHF, widely adopted in LLM alignment."
    338     },
    339     {
    340       "title": "Constitutional ai: Harmlessness from ai feedback",
    341       "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"],
    342       "year": 2022,
    343       "relevance": "RLAIF approach using AI-generated feedback for alignment, reducing human annotation costs."
    344     },
    345     {
    346       "title": "KTO: Model alignment as prospect theoretic optimization",
    347       "authors": ["Kawin Ethayarajh", "Winnie Xu", "Niklas Muennighoff"],
    348       "year": 2024,
    349       "relevance": "Alternative alignment approach using binary feedback instead of pairwise preferences."
    350     },
    351     {
    352       "title": "SimPO: Simple preference optimization with a reference-free reward",
    353       "authors": ["Yu Meng", "Mengzhou Xia", "Danqi Chen"],
    354       "year": 2024,
    355       "relevance": "Reference-free alignment method with length control, simplifying the DPO framework."
    356     },
    357     {
    358       "title": "Self-rewarding language models",
    359       "authors": ["Weizhe Yuan", "Richard Yuanzhe Pang", "Kyunghyun Cho"],
    360       "year": 2024,
    361       "relevance": "LLM self-evaluation for alignment, using the model as both generator and reward model."
    362     },
    363     {
    364       "title": "Is DPO superior to PPO for LLM alignment? A comprehensive study",
    365       "authors": ["Shusheng Xu", "Wei Fu", "Jiaxuan Gao"],
    366       "year": 2024,
    367       "relevance": "Direct empirical comparison of DPO vs PPO alignment approaches."
    368     },
    369     {
    370       "title": "RLAIF: Scaling reinforcement learning from human feedback with AI feedback",
    371       "authors": ["Harrison Lee", "Samrat Phatale", "Hassan Mansoor"],
    372       "year": 2023,
    373       "relevance": "Google's RLAIF work directly comparing human and AI feedback for alignment."
    374     },
    375     {
    376       "title": "Nash learning from human feedback",
    377       "authors": ["Rémi Munos", "Michal Valko", "Daniele Calandriello"],
    378       "year": 2024,
    379       "relevance": "Game-theoretic approach to alignment using preference models instead of pointwise rewards."
    380     }
    381   ]
    382 }

Impressum · Datenschutz