scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28926B)
      1 {
      2   "paper": {
      3     "title": "Spread Preference Annotation: Direct Preference Judgment for Efficient LLM Alignment",
      4     "authors": [
      5       "Dongyoung Kim",
      6       "Kimin Lee",
      7       "Jinwoo Shin",
      8       "Jaehyung Kim"
      9     ],
     10     "year": 2024,
     11     "venue": "International Conference on Learning Representations (ICLR 2025)",
     12     "arxiv_id": "2406.04412"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "SPA achieves a 21.13% win rate on AlpacaEval 2.0 using only 3.3% of ground-truth preference labels from UltraFeedback, outperforming both standard DPO with the same data (7.68%) and Zephyr-7b-β trained on 100% of labels (10.03%). The direct preference judgment from model logits outperforms external reward models (PairRM) and LLM-as-judge baselines, and the self-refinement with decoupled noise detection contributes meaningful gains. The framework generalizes across Mistral-7B, Phi-2, LLaMA-3, and Phi-3 models, and can function even without seed preference data.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Footnote 1 provides a GitHub URL: https://github.com/kingdy2002/SPA. The Reproducibility Statement also mentions code and checkpoint release."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses publicly available datasets: UltraFeedback ('argilla/ultrafeedback-binarized-preferences-cleaned', footnote 14) and UltraChat (footnote 11). All training data is from public HuggingFace datasets."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Appendix B.4 mentions '4 A6000 GPUs' but no requirements.txt, Dockerfile, conda environment file, or detailed library version specifications are provided in the paper."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided in the paper. The Reproducibility Statement says 'we will release our codes and the checkpoints' but does not include specific commands or a reproducing-results guide."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All main results in Tables 1-3 and 5-7 report only point estimates (e.g., '21.13%', '15.39%') with no confidence intervals or error bars."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims SPA 'outperforms' and 'significantly improves' baselines based solely on comparing point estimates. No statistical significance tests (p-values, t-tests, bootstrap tests) are reported."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Effect sizes are reported with baseline context throughout: e.g., 'win rate improved from 7.68% to 21.13%' (Table 1), 'length-control win rate from 7.58% to 15.39%', and improvement from 4.72% to 21.13% vs GPT-4."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No justification for the choice of 2K seed samples (3.3%), or the 8K/20K/30K iteration split sizes. No power analysis or discussion of whether these sizes are sufficient."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Main results (Tables 1-2) appear to be single-run numbers with no variance or standard deviation reported. Table 4 reports variance across 3 different seed data samplings, but this is a sensitivity analysis, not multi-run variance for the primary results."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table 2 compares against Iterative DPO with PairRM (external reward model) and Iterative DPO with LLM-as-judge. Table 1 compares against DPO and Zephyr-7b-β."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "PairRM (Jiang et al., 2023b) and Iterative DPO (Snorkel, 2024; Xu et al., 2023) are recent and competitive baselines. PairRM is described as showing 'state-of-art performance in AlpacaEval2.0 benchmark.'"
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Table 6 presents ablations removing self-refinement (SR) and decoupled noise detection (DND), showing their individual contributions. Table 7 ablates the reference model choice."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper evaluates on AlpacaEval 2.0 (both raw win rate and length-controlled win rate) and MT-Bench (average score across 8 categories), providing multiple complementary metrics."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "All evaluation is automated: AlpacaEval 2.0 uses GPT-4 as judge, MT-Bench uses GPT-4 scoring. No human evaluation of outputs is conducted, despite the paper claiming to improve 'alignment with human preferences.'"
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "AlpacaEval 2.0 (805 instructions) and MT-Bench are standard held-out evaluation benchmarks entirely separate from the UltraFeedback training data."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Figure 5a provides MT-Bench task-wise evaluation across 8 categories (Writing, Roleplay, Reasoning, Math, Coding, Extraction, STEM, Humanities)."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section C and Figure 5a discuss that SPA shows 'almost no gain in Coding and degradation in Math.' Appendix A.1 discusses the length bias limitation. Table 10 quantifies the length increase."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper reports degradation in Math tasks and no gain in Coding (Figure 5a). It also reports that self-refinement without decoupled noise detection yields only marginal improvement (Table 6: 19.91% to 19.94%)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims 'superior alignment performance on AlpacaEval 2.0 with only 3.3% of the ground-truth preference labels' which is supported by Table 1 (21.13% vs 10.03% for Zephyr-7b-β with 100% labels)."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper makes causal claims ('SPA improves alignment') and supports them with controlled ablation studies (Table 6) that isolate the effect of each component (data expansion, self-refinement, decoupled noise detection) through single-variable removal."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The abstract claims the framework is 'highly competitive and practical for real-world applications' but all evaluation is on automated benchmarks (AlpacaEval, MT-Bench) with GPT-4 as judge. The title 'Efficient LLM Alignment' is broad relative to the tested settings."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 5.2 discusses distribution shift as an alternative explanation for why PairRM underperforms. Appendix D specifically addresses the length bias confound with Table 10 showing length-regularized variants, and LC win rate is reported throughout to account for response length."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper measures AlpacaEval 2.0 win rate (GPT-4 as judge) and MT-Bench scores and frames these as 'alignment with human preferences.' The gap between automated evaluation by GPT-4 and actual human preference is not discussed — the paper only states that 'AlpacaEval 2.0 approximately evaluates human preference' without deeper analysis."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Specific model identifiers are provided: 'Mistral-7b-0.1' with HuggingFace path 'alignment-handbook/zephyr-7b-sft-full' (footnote 6), 'lole25/phi-2-sft-ultrachat-full' (footnote 8), and HuggingFace links for LLaMA-3-8B-Instruct and Phi-3-medium-4k-instruct."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Listing 1 provides the actual prompt template used for preference judgment, and Listing 2 provides a complete evaluation instruction example with filled values, both in Appendix B.2."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 5.1 reports comprehensive hyperparameters: β=0.1, batch size 32, learning rate 5×10⁻⁷, α=0.1, K=10, λ values (1/2, 1/4, 1/8), temperature 0.7, optimizer (AdamW), cosine scheduler with 10% warmup."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. SPA is a training framework using DPO with self-generated preference data, not an agent-based system."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 5.1 documents the data pipeline: 2K seed samples (3.3% of 60K) with gold labels, remaining split into 8K/20K/30K for iterations 1/2/3 with only prompts. Appendix B.4 documents response generation with max 300 token limit and SMA-based confidence measurement."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Appendix A.1 'Limitation and Future Work' provides a dedicated discussion of limitations, specifically addressing the length bias tendency and suggesting future directions."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Appendix A.1 discusses a specific threat: SPA tends to increase response length, and acknowledges 'depending on the user, this behavior could be dispreferred.' Figure 5a identifies specific task categories (Coding, Math) where SPA fails to improve."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The paper does not explicitly state what the results do NOT show or what settings are excluded. It does not bound generalizations to specific conditions beyond noting length bias and coding/math limitations."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "While UltraFeedback is publicly available, the specific 2K seed samples, generated responses, self-annotated preference labels, and noise identification data are not released for independent verification."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 5.1 describes data sourcing from UltraFeedback ('argilla/ultrafeedback-binarized-preferences-cleaned'), the seed data sampling procedure, and response generation parameters (temperature 0.7, 2 responses per prompt)."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data comes from standard public datasets (UltraFeedback, UltraChat)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The pipeline is documented: UltraFeedback → seed/iteration split → response generation from current model → preference labeling via logits → noise detection → DPO training. Each step is described in Sections 4 and 5.1."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Acknowledgments section discloses funding from IITP grants funded by the Korean government (MSIT), with specific grant numbers (No.2021-0-02068 and RS-2022-II220959)."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: KAIST and Yonsei University. The paper does not evaluate any commercial product from these institutions."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Funding is from Korean government agencies (IITP/MSIT), which have no financial stake in the specific outcome of this LLM alignment research."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper does not state the training data cutoff for Mistral-7B-v0.1 or any other base model used. No discussion of when the models' pre-training data was collected."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of whether AlpacaEval 2.0 instructions or MT-Bench questions could have appeared in the Mistral base model's pre-training data or in the UltraFeedback/UltraChat fine-tuning data."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "AlpacaEval and MT-Bench were publicly available before Mistral's training. No contamination analysis or discussion of whether evaluation data may have leaked into pre-training corpora."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. All evaluation is automated."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Appendix B.4 reports wall-clock timing: 'generating responses for 10K prompts takes approximately 1 to 2 hour, and preference judging for generated responses also takes about 1 to 2 hour. For training... it takes about 1 to 2 hours per epoch.'"
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Appendix B.4 states: '4 A6000 GPUs' and 'the total time required to complete response generation, preference judgment, and one epoch of training was between 5 to 6 hours per 10K prompt.'"
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Table 4 reports results across 3 different random samplings of the initial seed preference dataset, showing variance. This addresses sensitivity to the seed data, though not random seed sensitivity in training."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The main results (Tables 1-2) do not state how many experimental runs produced them. Table 4 uses 3 seed data samplings for sensitivity analysis, but the primary results lack this information."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Hyperparameters are reported (β=0.1, α=0.1, K=10, etc.) but no search budget, number of configurations tried, or search method is described."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The paper reports fixed hyperparameter values without explaining how they were selected or whether they were tuned on a validation set."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors implement baselines (Iterative DPO with PairRM, LLM-as-judge) and compare against their own method without acknowledging that author-implemented baselines may systematically underperform."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "SPA requires 3 iterations of response generation plus training, consuming substantially more compute than single-stage DPO baselines. This compute difference is not discussed as a potential confound for the observed improvements."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper uses AlpacaEval 2.0 (GPT-4 as judge) and MT-Bench as proxies for 'alignment with human preferences' without questioning whether these automated metrics actually capture the claimed construct."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved in the comparisons. All methods differ only in training procedure."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether AlpacaEval 2.0 or MT-Bench tasks existed before the Mistral model's training data collection period."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether any evaluation setup provides information that would not be available in real usage scenarios."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No analysis of whether UltraFeedback training prompts share structural similarities with AlpacaEval 2.0 or MT-Bench evaluation prompts."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No concrete leakage detection or prevention method is applied (no canary strings, membership inference, n-gram overlap, or decontamination pipeline)."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "SPA achieves 21.13% win rate on AlpacaEval 2.0 vs GPT-4 using only 3.3% of ground-truth preference labels, compared to 7.68% for standard DPO with the same data.",
    369       "evidence": "Table 1 reports SPA at 21.13% win rate and 15.39% LC win rate vs DPO at 7.68% and 9.03% respectively, both using 3.3% gold labels from UltraFeedback.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "SPA with 3.3% labeled data outperforms Zephyr-7b-β trained on 100% of UltraFeedback labels on AlpacaEval 2.0.",
    374       "evidence": "Table 1 shows SPA (21.13% WR, 15.39% LC WR) vs Zephyr-7b-β (10.03% WR, 11.75% LC WR). Both use the same base model and SFT data.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Direct preference judgment from model logits outperforms external reward models (PairRM) and LLM-as-judge for iterative preference learning.",
    379       "evidence": "Table 2 shows SPA at 21.13%/15.39% vs PairRM at 9.46%/11.87% and LLM-as-judge at 9.18%/9.28% on AlpacaEval 2.0. Figure 3 shows the gap widens across iterations.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "SPA can improve alignment even without seed preference data.",
    384       "evidence": "Figure 4 shows Mistral-7B-instruct-v0.1 improving from 6.31% to 9.79% win rate and from 10.14% to 11.59% LC win rate when applying SPA without seed data.",
    385       "supported": "weak"
    386     },
    387     {
    388       "claim": "Self-refinement with decoupled noise detection significantly improves SPA's performance.",
    389       "evidence": "Table 6 ablation: data expansion alone achieves 14.41% LC WR; adding SR gives 14.7%; adding both SR and DND gives 15.39%. Win rate improvement is more substantial (19.91% → 21.13%).",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "SPA generalizes across different LLM architectures and sizes.",
    394       "evidence": "Table 5 shows improvements across Phi-2-2.7B (5.67%→9.43% WR), LLaMA-3-8B (25.39%→34.84%), and Phi-3-14B (22.12%→24.14%).",
    395       "supported": "moderate"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "Length inflation confound",
    401       "detail": "Table 10 shows SPA-trained models generate substantially longer responses (2749 chars at iter 2 vs 901 for SFT). The raw win rate gains (4.72%→21.13%) are much larger than LC win rate gains (7.58%→15.39%), suggesting response length drives a significant portion of the improvement. While LC win rate is reported, the paper leads with the more impressive raw numbers."
    402     },
    403     {
    404       "flag": "No significance tests for any comparison",
    405       "detail": "All claims of 'significantly outperforms' and 'superior performance' rest on comparing point estimates without statistical tests. The largest claimed difference (SPA 21.13% vs PairRM 9.46%) could potentially be due to random variation, since no error bars or significance tests are reported for the main results."
    406     },
    407     {
    408       "flag": "Uncontrolled compute budget",
    409       "detail": "SPA uses 3 iterations of response generation + preference labeling + DPO training (15-18 GPU-hours on 4 A6000s), while the DPO baseline uses a single training phase. Performance improvements may partly reflect the additional compute rather than the method's novelty."
    410     },
    411     {
    412       "flag": "All evaluation relies on GPT-4 as judge",
    413       "detail": "For a paper about improving 'alignment with human preferences,' all evaluation uses GPT-4 as the preference judge. No human evaluation is conducted. GPT-4 judge preferences may not align well with actual human preferences, particularly given the known length bias of LLM judges."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    419       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Christopher D Manning", "Stefano Ermon", "Chelsea Finn"],
    420       "year": 2023,
    421       "relevance": "Core method that SPA builds upon — directly optimizing LLM preferences without separate reward modeling."
    422     },
    423     {
    424       "title": "Training language models to follow instructions with human feedback",
    425       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    426       "year": 2022,
    427       "relevance": "Foundational RLHF paper establishing the preference learning framework for LLM alignment."
    428     },
    429     {
    430       "title": "Self-rewarding language models",
    431       "authors": ["Weizhe Yuan", "Richard Yuanzhe Pang", "Kyunghyun Cho"],
    432       "year": 2024,
    433       "arxiv_id": "2401.10020",
    434       "relevance": "Related approach using LLM-as-judge for self-improvement in alignment, which SPA's direct preference judgment aims to outperform."
    435     },
    436     {
    437       "title": "Constitutional AI: Harmlessness from AI Feedback",
    438       "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"],
    439       "year": 2022,
    440       "arxiv_id": "2212.08073",
    441       "relevance": "Influential approach to AI alignment using AI feedback instead of human feedback."
    442     },
    443     {
    444       "title": "Zephyr: Direct Distillation of LM Alignment",
    445       "authors": ["Lewis Tunstall", "Edward Beeching", "Nathan Lambert"],
    446       "year": 2023,
    447       "arxiv_id": "2310.16944",
    448       "relevance": "Key baseline using the same model architecture and SFT recipe, trained on full UltraFeedback data."
    449     },
    450     {
    451       "title": "UltraFeedback: Boosting Language Models with High-Quality Feedback",
    452       "authors": ["Ganqu Cui", "Lifan Yuan", "Ning Ding"],
    453       "year": 2023,
    454       "arxiv_id": "2310.01377",
    455       "relevance": "Source of the preference dataset used as training data in all SPA experiments."
    456     },
    457     {
    458       "title": "KTO: Model Alignment as Prospect Theoretic Optimization",
    459       "authors": ["Kawin Ethayarajh", "Winnie Xu", "Niklas Muennighoff", "Dan Jurafsky", "Douwe Kiela"],
    460       "year": 2024,
    461       "arxiv_id": "2402.01306",
    462       "relevance": "Alternative preference learning algorithm that removes reliance on pairwise labels, relevant to efficient alignment."
    463     },
    464     {
    465       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    466       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    467       "year": 2023,
    468       "relevance": "Defines MT-Bench evaluation used in this paper and establishes the LLM-as-judge paradigm that SPA compares against."
    469     },
    470     {
    471       "title": "SimPO: Simple Preference Optimization with a Reference-Free Reward",
    472       "authors": ["Yu Meng", "Mengzhou Xia", "Danqi Chen"],
    473       "year": 2024,
    474       "relevance": "Reference-free preference optimization method relevant to efficient LLM alignment approaches."
    475     },
    476     {
    477       "title": "Bootstrapping Language Models with DPO Implicit Rewards",
    478       "authors": ["Changyu Chen", "Zichen Liu", "Chao Du"],
    479       "year": 2025,
    480       "relevance": "Concurrent work proposing a similar iterative self-improvement idea with a focus on length regularization."
    481     },
    482     {
    483       "title": "Self-play Preference Optimization for Language Model Alignment",
    484       "authors": ["Yue Wu", "Zhiqing Sun", "Huizhuo Yuan"],
    485       "year": 2024,
    486       "arxiv_id": "2405.00675",
    487       "relevance": "Related self-play approach for preference optimization in LLM alignment."
    488     },
    489     {
    490       "title": "Deep reinforcement learning from human preferences",
    491       "authors": ["Paul F Christiano", "Jan Leike", "Tom Brown"],
    492       "year": 2017,
    493       "relevance": "Foundational RLHF paper establishing the framework for learning from human preferences."
    494     }
    495   ],
    496   "engagement_factors": {
    497     "practical_relevance": {
    498       "score": 2,
    499       "justification": "Code released and method requires only 'a few lines of additional code to the original DPO codebase,' making it practically adoptable for teams doing LLM alignment with limited annotation budget."
    500     },
    501     "surprise_contrarian": {
    502       "score": 1,
    503       "justification": "Shows 3.3% of labeled data can outperform 100% with the right framework, which is somewhat surprising but follows a known pattern of iterative self-improvement methods."
    504     },
    505     "fear_safety": {
    506       "score": 0,
    507       "justification": "No safety or security concerns raised; the paper is about efficient alignment, not attacks or vulnerabilities."
    508     },
    509     "drama_conflict": {
    510       "score": 0,
    511       "justification": "No controversy or conflict with existing claims or institutions."
    512     },
    513     "demo_ability": {
    514       "score": 1,
    515       "justification": "Code is on GitHub but requires significant GPU resources (4 A6000s) and multi-hour training to reproduce; not a quick demo."
    516     },
    517     "brand_recognition": {
    518       "score": 1,
    519       "justification": "KAIST is well-known in ML research and ICLR is a top venue, but the authors and lab are not household names."
    520     }
    521   }
    522 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs