scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29886B)
      1 {
      2   "paper": {
      3     "title": "TODO: Enhancing LLM Alignment with Ternary Preferences",
      4     "authors": [
      5       "Yuxiang Guo",
      6       "Lu Yin",
      7       "Bo Jiang",
      8       "Jiaqi Zhang"
      9     ],
     10     "year": 2024,
     11     "venue": "International Conference on Learning Representations (ICLR 2025)",
     12     "arxiv_id": "2411.02442",
     13     "doi": "10.48550/arXiv.2411.02442"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "TODO builds on the Tie-rank Oriented Bradley-Terry model (TOBT), an extension of the Bradley-Terry model that explicitly handles ties in preference data, to obtain a ternary preference optimization algorithm that consistently outperforms DPO on Mistral-7B and Llama 3-8B across preference modeling accuracy, MT Bench, and six standard benchmarks. The method benefits from a margin parameter α that improves robustness to noisy preference labels, and performs well even on purely binary preference data without any tie labels. Optimal performance is typically achieved at 10-30% tie data ratios, though the best ratio varies across models and benchmarks.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract states 'The implementation details and datasets can be found in https://github.com/XXares/TODO', providing a public GitHub repository URL."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper claims datasets are available at the GitHub URL. Additionally, the primary datasets used (Ultrafeedback, Chatarena) are publicly available, and the paper provides HuggingFace links for model checkpoints and data."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper reports training hyperparameters (Table 10) but does not specify library versions, Python version, PyTorch version, CUDA version, or provide a requirements.txt or Dockerfile."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper provides a GitHub link but does not include step-by-step reproduction instructions, a 'Reproducing Results' section, or specific commands to run in the paper itself."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All result tables (Tables 2-5, 7-8, 12-13) and figures (Figures 2-3) report only point estimates with no confidence intervals, error bars, or ± notation."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims TODO 'outperforms' and 'consistently outperforms' DPO but provides no statistical significance tests (no p-values, t-tests, or bootstrap tests). All comparisons are based solely on comparing raw numbers."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Tables 2-5 provide absolute performance numbers for all methods alongside baselines (SFT, DPO), allowing direct computation of effect magnitudes. For example, Table 2 shows DPO at 66.24 average vs TODO at 71.02 at tie ratio 0.0."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The training set size of 20k and test set size of 1500 are stated without justification or power analysis for why these sizes were chosen."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No standard deviations, variance measures, or results across multiple runs are reported anywhere in the paper. All results appear to be from single runs."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "DPO is the primary baseline throughout. Section 6.4 extends comparisons to KTO, SimPO, and ODPO (Tables 4-5). SFT-only baselines are also included in Tables 2-3."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "All baselines are recent: DPO (2023), KTO (2024), SimPO (2024), ODPO (2024). These represent contemporary alignment methods."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper varies the tie data ratio (0, 0.1, 0.2, 0.3) to isolate the effect of tie data. Appendix A.3 (Tables 7-8) varies α across {0.1, 0.2, 0.5, 0.8, 1.2} to study its sensitivity. Section 6.3 analyzes key factors."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Evaluation uses preference test set accuracy, Reward Bench accuracy (5 subcategories), MT Bench scores, and six standard benchmarks (Piqa, ARC-c, ARC-e, MMLU, Hellaswag, Winogrande)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "All evaluation is automated. MT Bench uses GPT-4-turbo as judge (not humans). Other benchmarks use automated metrics via OpenCompass. No human evaluation of model outputs is performed."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 5.1 describes a curated 1500-sample in-distribution test set separate from training data. Reward Bench serves as an external out-of-distribution test set. Standard benchmarks (MMLU, etc.) are also separate."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Reward Bench results are broken down by category (Chat, ChatHard, Safety, Reasoning, Prior) in Appendix A.12 (Tables 12-13). Individual benchmark results are shown separately in Tables 2-3."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Tables 7-8 in Appendix A.3 show that TODO diverges at α=0.8 and α=1.2 in several settings. Table 2 shows TODO at tie ratio 0.3 (67.37) degrades below the SFT baseline (69.33) for Mistral."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper reports divergence at high α values (Tables 7-8), and performance degradation at 30% tie ratio for Mistral (Table 2, TODO drops from 71.02 to 67.37). These are genuine negative results showing method limitations."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims of consistent outperformance over DPO are supported by Figures 2-3 and Tables 2-5. Claims about Mistral-7B and Llama 3-8B are verified. The claim about binary preference alignment is supported by results at tie ratio 0.0."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims ('TODO improves alignment') are supported by controlled experiments varying one factor at a time (tie ratio, method) while keeping all other hyperparameters consistent (Section 5.2). The α ablation in Appendix A.3 provides additional controlled evidence."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title 'Enhancing LLM Alignment' suggests general applicability, but results are only on two 7-8B models. The paper does not explicitly bound its claims to these model sizes, architectures, or language settings."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper does not consider whether the margin α alone (without proper tie handling) could explain the gains on binary data. TODO at tie ratio 0.0 already outperforms DPO, suggesting the margin effect is substantial, but this is not discussed as an alternative explanation."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper measures benchmark accuracy and MT Bench scores but frames these as demonstrating improved 'alignment.' No discussion of whether these proxy metrics actually capture alignment quality, or the gap between benchmark performance and genuine human preference alignment."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Specific model identifiers with HuggingFace URLs are provided: alignment-handbook/zephyr-7b-sft-full, kykim0/llama3-8b-ultrachat-sft. For evaluation: 'gpt-4-turbo-2024-04-09' and 'gpt-4o-2024-05-13' are specified with version dates."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "For MT Bench evaluation with GPT-4 as judge, no actual prompts or scoring instructions are provided. Appendix A.10 says 'we use the default prompt template' from OpenCompass without reproducing the actual text."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Table 10 (Appendix A.9) lists learning rate, batch size, and β for both models. Additional hyperparameters stated: α=0.5, 3 epochs, Adam optimizer, weight decay 0, cosine learning rate scheduler."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. The paper proposes a training loss function (TODO), not an agentic system."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 5.1 and Appendix A.8 describe sampling 20k from Ultrafeedback's 383k pairs, maintaining source distribution (Table 9), and classifying ties based on quality score equality. Tie ratio construction at {0, 0.1, 0.2, 0.3} is explained."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The paper has no dedicated 'Limitations' or 'Threats to Validity' section. Section 7 (Discussion) discusses future integration possibilities. Section 8 (Conclusion) mentions future work but does not discuss limitations of TODO itself."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No threats to validity are discussed anywhere in the paper. There is no discussion of potential confounds, generalizability concerns, or methodological limitations."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper does not explicitly state what the results do NOT show. No discussion of limitations from testing on only 7-8B models, primarily English tasks, or specific dataset families."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The base datasets (Ultrafeedback, Chatarena) are publicly available with HuggingFace links provided. The paper states constructed datasets are available at the GitHub repository."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 5.1 and Appendix A.8 describe the data source (Ultrafeedback with 383k pairs), sampling procedure, tie classification criteria (identical quality scores from GPT-4), and source distribution (Table 9)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Data sources are standard public preference datasets (Ultrafeedback, Chatarena)."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The pipeline is documented: Ultrafeedback (383k pairs, scored by GPT-4) → sample 20k maintaining source distribution → classify ties by quality score equality → construct datasets at tie ratios {0, 0.1, 0.2, 0.3}. Appendix A.8 provides distribution details."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding source or acknowledgments section is present in the paper. The work was done at Meituan Inc. (a major Chinese tech company), but there is no explicit funding disclosure."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly listed: Meituan Inc., Beihang University, and University of Surrey. The footnotes note which work was done during internship/research at Meituan."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding is disclosed, so independence cannot be assessed. Meituan Inc. (an industry employer of multiple authors) could benefit from improved alignment methods, representing a potential non-independent interest."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The paper does not state the training data cutoff dates for the base Mistral-7B or Llama 3-8B pre-trained models, which is relevant since the evaluation benchmarks may have been in the pre-training data."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether the pre-trained models' training data overlaps with evaluation benchmarks (MMLU, ARC, Piqa, Hellaswag, Winogrande), all of which were publicly available before these models were trained."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "Evaluation benchmarks like MMLU (2021), ARC (2018), Piqa (2019), and Hellaswag (2019) were all published well before Mistral-7B (2023) and Llama 3 (2024) training. This contamination risk is not discussed."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study. All experiments are automated model training and benchmark evaluation."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. The study trains and evaluates LLMs using existing preference datasets."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No inference cost, latency, or API cost information is reported for the trained models or the GPT-4-based evaluation."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No GPU hours, hardware specifications, total API spend, or training time are reported despite fine-tuning 7-8B parameter models."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of multiple random seeds. All results appear to be single-run numbers without any seed sensitivity analysis."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of experimental runs is never stated. Results are presented without clarifying whether they represent single runs or averages."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "While Appendix A.3 shows α sensitivity analysis across 5 values, no formal hyperparameter search budget (total compute, number of configurations tried for other hyperparameters) is reported."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Appendix A.3 provides theoretical and empirical justification for α=0.5 selection, showing loss tradeoffs (Figure 5), performance across α values (Tables 7-8), and threshold-based criteria for loss balance."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical significance tests are performed at all, so the question of multiple comparison correction does not arise."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors implement TODO and compare against their own implementations of other methods. Self-comparison bias is not acknowledged or mitigated."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": false,
    330         "answer": false,
    331         "justification": "TODO and DPO differ only in their loss function; compute differences are negligible. This comparison is not meaningful here."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper uses MMLU, ARC, Piqa, Hellaswag, Winogrande, and MT Bench to demonstrate 'alignment' but does not discuss whether these benchmarks actually measure alignment quality or what their construct validity limitations are."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is involved. TODO is a training loss function, not an agentic scaffold."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of temporal leakage. Evaluation benchmarks (MMLU 2021, ARC 2018, Piqa 2019, Hellaswag 2019) existed well before model pre-training cutoffs."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether evaluation setups leak information. The perplexity-based evaluation mode (PPL) for multiple-choice tasks is standard but its potential information leakage properties are not discussed."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of independence between the training data (Ultrafeedback) and evaluation benchmarks. Potential overlap between Ultrafeedback's source data and benchmark test items is not considered."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are mentioned."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "TODO consistently outperforms DPO in preference modeling accuracy on both in-distribution and out-of-distribution datasets across varying tie data proportions.",
    370       "evidence": "Figure 2 shows higher accuracy for TODO on the 1500-sample test set and Reward Bench across tie ratios {0, 0.1, 0.2, 0.3} for both Mistral-7B and Llama 3-8B. Detailed Reward Bench subcategory scores in Tables 12-13 (Appendix A.12).",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "TODO outperforms DPO on MT Bench across all training sets for both Mistral and Llama 3 models.",
    375       "evidence": "Figures 3a and 3b show TODO scores higher than DPO at all four tie ratios for both model families. Mistral: TODO 6.74 vs DPO 6.46 at ratio 0.0; Llama 3: TODO 6.91 vs DPO 6.73 at ratio 0.2.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "TODO achieves better average performance than DPO on six standard benchmarks (Piqa, ARC-c, ARC-e, MMLU, Hellaswag, Winogrande) across all tie data ratios.",
    380       "evidence": "Tables 2 and 3 show TODO outperforms DPO at every tie ratio on average. Mistral: TODO ranges 67.37-71.17 vs DPO 65.57-70.78. Llama 3: TODO ranges 70.86-71.13 vs DPO 70.42-70.75.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "TODO shows strong results in binary preference alignment (without tie data), outperforming DPO even at tie ratio 0.0.",
    385       "evidence": "Tables 2-3 at tie ratio 0.0: Mistral TODO 71.02 vs DPO 66.24; Llama 3 TODO 71.03 vs DPO 70.75. MT Bench: Mistral TODO 6.74 vs DPO 6.46; Llama 3 TODO 6.74 vs DPO 6.69.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Strategic incorporation of tie data enhances alignment capabilities when used with TODO.",
    390       "evidence": "Best performance on six benchmarks: Mistral at 20% tie ratio (71.17), Llama 3 at 30% (71.13). MT Bench: Llama 3 peaks at 20% tie ratio (6.91). However, Mistral MT Bench peaks at 0% tie ratio (6.74), and a 30% tie ratio substantially lowers Mistral's six-benchmark average (to 67.37).",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "TODO outperforms other binary alignment methods (KTO, SimPO, ODPO) when tie data is present.",
    395       "evidence": "Tables 4-5 compare methods on Ultrafeedback and Chatarena datasets. With tie data, TODO achieves highest test accuracy (78.47 on Chatarena) and MT Bench (5.96 on Ultrafeedback with ties, 5.83 on Chatarena with ties).",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No error bars or statistical tests",
    402       "detail": "All comparisons across 12+ experimental conditions are based solely on comparing point estimates from apparently single runs. Many improvements are small (e.g., less than 1 percentage point on Llama 3 benchmarks in Table 3), making it impossible to distinguish signal from noise without variance estimates."
    403     },
    404     {
    405       "flag": "No limitations section",
    406       "detail": "The paper has no dedicated limitations, threats to validity, or scope boundaries discussion. For an ICLR paper, this is a notable omission that obscures the method's known weaknesses and boundaries."
    407     },
    408     {
    409       "flag": "Inconsistent optimal tie ratio",
    410       "detail": "The 'optimal' tie data ratio varies across models and benchmarks: Mistral MT Bench peaks at 0%, Mistral six-benchmarks at 20%, Llama 3 MT Bench at 20%, Llama 3 six-benchmarks at 30%. This undermines the practical guidance about how much tie data to use."
    411     },
    412     {
    413       "flag": "Missing compute and resource information",
    414       "detail": "No GPU hours, hardware specifications, or training times are reported for fine-tuning 7-8B parameter models, making practical cost comparison impossible."
    415     },
    416     {
    417       "flag": "Margin effect confounded with tie handling",
    418       "detail": "TODO at tie ratio 0.0 (no tie data) already outperforms DPO, suggesting the margin α alone provides substantial benefit. The paper does not disentangle the contribution of the margin from the tie-handling mechanism, making it unclear which component drives the gains."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    424       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Stefano Ermon", "Christopher D. Manning", "Chelsea Finn"],
    425       "arxiv_id": "2305.18290",
    426       "relevance": "Core baseline method — DPO is the foundational binary preference alignment algorithm that TODO extends."
    427     },
    428     {
    429       "title": "KTO: Model Alignment as Prospect Theoretic Optimization",
    430       "authors": ["Kawin Ethayarajh", "Winnie Xu", "Niklas Muennighoff", "Dan Jurafsky", "Douwe Kiela"],
    431       "year": 2024,
    432       "relevance": "Alternative alignment method compared against TODO; uses prospect theory instead of BT model for preference optimization."
    433     },
    434     {
    435       "title": "SimPO: Simple Preference Optimization with a Reference-Free Reward",
    436       "authors": ["Yu Meng", "Mengzhou Xia", "Danqi Chen"],
    437       "year": 2024,
    438       "arxiv_id": "2405.14734",
    439       "relevance": "Reference-free alignment baseline compared against TODO in Section 6.4."
    440     },
    441     {
    442       "title": "Direct Preference Optimization with an Offset",
    443       "authors": ["Afra Amini", "Tim Vieira", "Ryan Cotterell"],
    444       "year": 2024,
    445       "arxiv_id": "2402.10571",
    446       "relevance": "ODPO introduces reward-difference margin to DPO; compared against TODO and discussed as integration candidate in Section 7."
    447     },
    448     {
    449       "title": "Proximal Policy Optimization Algorithms",
    450       "authors": ["John Schulman", "Filip Wolski", "Prafulla Dhariwal", "Alec Radford", "Oleg Klimov"],
    451       "arxiv_id": "1707.06347",
    452       "relevance": "Foundational RL algorithm used in RLHF alignment pipelines that TODO aims to improve upon."
    453     },
    454     {
    455       "title": "Zephyr: Direct Distillation of LM Alignment",
    456       "authors": ["Lewis Tunstall", "Edward Beeching", "Nathan Lambert"],
    457       "arxiv_id": "2310.16944",
    458       "relevance": "Provides the Ultrafeedback-binarized dataset and zephyr-sft-full model used as the Mistral baseline in TODO experiments."
    459     },
    460     {
    461       "title": "Constitutional AI: Harmlessness from AI Feedback",
    462       "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"],
    463       "year": 2022,
    464       "relevance": "Foundational work on AI feedback for alignment, relevant to preference data quality and LLM-as-judge approaches used in TODO's training data."
    465     },
    466     {
    467       "title": "ORPO: Monolithic Preference Optimization without Reference Model",
    468       "authors": ["Jiwoo Hong", "Noah Lee", "James Thorne"],
    469       "arxiv_id": "2403.07691",
    470       "relevance": "Reference-free alignment alternative that eliminates the reference model requirement from DPO-style methods."
    471     },
    472     {
    473       "title": "RewardBench: Evaluating Reward Models for Language Modeling",
    474       "authors": ["Nathan Lambert", "Valentina Pyatkin", "Jacob Morrison"],
    475       "arxiv_id": "2403.13787",
    476       "relevance": "Out-of-distribution evaluation benchmark used to assess TODO's preference modeling accuracy."
    477     },
    478     {
    479       "title": "UltraFeedback: Boosting Language Models with High-Quality Feedback",
    480       "authors": ["Ganqu Cui", "Lifan Yuan", "Ning Ding"],
    481       "year": 2023,
    482       "relevance": "Primary preference dataset used for TODO training and evaluation; contains GPT-4-scored response pairs with quality scores enabling tie identification."
    483     },
    484     {
    485       "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference",
    486       "authors": ["Wei-Lin Chiang", "Lianmin Zheng", "Ying Sheng"],
    487       "year": 2024,
    488       "relevance": "Human-labeled preference platform providing the Chatarena dataset used for TODO evaluation; contains 29.4% naturally occurring ties."
    489     },
    490     {
    491       "title": "Insights into Alignment: Evaluating DPO and its Variants Across Multiple Tasks",
    492       "authors": ["Amir Saeidi", "Shivanshu Verma", "Chitta Baral"],
    493       "arxiv_id": "2404.14723",
    494       "relevance": "Evaluates DPO variants across tasks; provides context for understanding alignment method performance differences."
    495     },
    496     {
    497       "title": "Direct Alignment of Language Models via Quality-Aware Self-Refinement",
    498       "authors": ["Runsheng Yu", "Yong Wang", "Xiaoqi Jiao", "Youzhi Zhang", "James T. Kwok"],
    499       "year": 2024,
    500       "arxiv_id": "2405.21040",
    501       "relevance": "Concurrent work using intrinsic knowledge constraints for preference alignment, discussed as compatible with TODO in Section 7."
    502     }
    503   ],
    504   "engagement_factors": {
    505     "practical_relevance": {
    506       "score": 2,
    507       "justification": "Practitioners doing RLHF/DPO alignment can adopt TODO as a drop-in loss function replacement; code is released on GitHub."
    508     },
    509     "surprise_contrarian": {
    510       "score": 1,
    511       "justification": "The idea that ties in preference data matter is reasonable but unsurprising; it's an incremental extension to the well-known BT model rather than a paradigm shift."
    512     },
    513     "fear_safety": {
    514       "score": 0,
    515       "justification": "No safety or security concerns raised; the paper is about improving alignment quality."
    516     },
    517     "drama_conflict": {
    518       "score": 0,
    519       "justification": "No controversy or dramatic claims; a straightforward methodological improvement paper."
    520     },
    521     "demo_ability": {
    522       "score": 2,
    523       "justification": "Code released on GitHub with datasets; implementable by those with GPU access for fine-tuning 7-8B models."
    524     },
    525     "brand_recognition": {
    526       "score": 1,
    527       "justification": "Meituan is a major Chinese tech company but not globally prominent in AI research; published at ICLR which carries prestige."
    528     }
    529   }
    530 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs