ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32212B)


      1 {
      2   "paper": {
      3     "title": "Inference time LLM alignment in single and multidomain preference spectrum",
      4     "authors": [
      5       "Sadat Shahriar",
      6       "Zheng Qi",
      7       "Nikolaos Pappas",
      8       "Srikanth Doss",
      9       "Monica Sunkara",
     10       "Kishaloy Halder",
     11       "Manuel Mager",
     12       "Yassine Benajiba"
     13     ],
     14     "year": 2024,
     15     "venue": "arXiv (submitted to ICLR 2025)",
     16     "arxiv_id": "2410.19206",
     17     "doi": "10.48550/arXiv.2410.19206"
     18   },
     19   "scan_version": 3,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "Alignment Vectors (AVs), obtained by subtracting base model weights from DPO-aligned model weights, enable inference-time preference tunability by varying a scalar lambda coefficient. Single-domain experiments on Mistral-7B across medical, financial, and legal domains show that lambda can shift responses between avoidance, generic, and expert proficiency levels. Multidomain AV arithmetic achieves the desired dominant behavior in 22 of 27 possible domain-behavior combinations and is 12x faster than joint training, though with substantially lower accuracy (e.g., 46% vs 100% for medical avoidance). AVs transfer across fine-tuning stages of the same model family, maintaining safety alignment while adjusting domain proficiency.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No code repository link is provided. The paper makes no mention of releasing source code for the alignment vector method or training pipeline."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper states 'The dataset will be available through this link' (Section 1) but no actual URL is provided in the paper text. A promise of future release counts as NO."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper mentions 'NVIDIA A100 GPUs' and 'Mistral-7B-Instruct-v0.3' but provides no requirements.txt, Dockerfile, library versions, or detailed environment specifications."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are provided. The methodology is described at a conceptual level but lacks runnable scripts or commands."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No confidence intervals or error bars are reported anywhere. Tables 1-5 present only point estimates of preference accuracy and GPT-4 judged accuracy."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No statistical significance tests are used. All comparisons between methods (prompting vs model editing vs joint training) are based solely on comparing raw accuracy numbers without any statistical testing."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The main accuracy results are presented as raw numbers without computed effect sizes. Speed comparisons (12x faster, half inference cost) provide context for efficiency claims, but the core performance comparisons lack formal effect sizes."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification given for the 38k synthetic queries or the 80/20 train/test split. No power analysis or sample size rationale is discussed."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. Results appear to be from single training runs."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Three baselines are included: 'default' (no alignment, no prompting), 'prompting' (instruction-augmented queries), and 'joint training' (training on combined multi-domain data). These are defined in Section 5.2."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "The paper discusses several contemporary inference-time alignment methods (ITI, DeAl, SteerLM, CPO, InferAligner) in Section 2 but does not benchmark against any of them. Only prompting and joint training are compared. The related work argues these methods have different objectives, but no empirical comparison is provided."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No ablation study is conducted. The paper does not test alternative AV extraction methods, different loss functions, or component removal. Varying lambda explores parameter sensitivity but is not an ablation."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Two metrics are used: 'preference accuracy' (log-probability based, Section 5.1) and 'GPT-4 judged generation accuracy' (Section 5.1). Both are reported in Tables 1-3."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "Human evaluation in Section 3.3 assesses the quality of the synthetic training data, not the system's outputs. The schema requires humans to evaluate what the system produced. System evaluation uses only automated metrics (preference accuracy and GPT-4 judging)."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 5.3 states: 'We utilize 80% of the generated data in each domain for training and 20% for testing. For the validation process, we allocated 3% of the data for training time validation.'"
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results are broken down by domain (Medical, Financial, Legal in Tables 1-4), by proficiency level (Exp, Gen, Avd), and by lambda value. Table 4 further shows cross-domain generalization effects."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 6.2 notes that only 22/27 multidomain combinations work. Section 6.3 discusses that negative lambda for safety produces 'inconsistent and mixed' scores and the model doesn't become fully unsafe. Section 6.2 discusses over-generalization as a failure mode."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Several negative results are reported: 5/27 multidomain combinations fail to achieve desired behavior (Section 6.2), negative lambda for safety doesn't achieve full unsafe behavior (Section 6.3), and over-generalization compromises multidomain precision (Section 6.2)."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims — dynamically adjustable behavior via AVs (Tables 1-3), inference cost reduction vs prompting (Section 6.1), AV transferability (Section 6.4, Figure 4), 12x speed advantage over retraining (Section 6.2) — are all supported by experimental results in the paper, though with caveats about accuracy trade-offs."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper's causal claims ('adjusting λ controls proficiency level') are supported by controlled single-variable manipulation: lambda is varied while other factors are held constant, and the resulting behavior changes are observed (Tables 1-3, Figure 2). This controlled interventionist design is adequate for the claims made."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 7 explicitly bounds generalization: 'we tested our approach only on Mistral-7b, so validation with other open-source LLMs and SLMs is necessary' and 'our method works only for LLMs with the same architecture.' The title focuses on 'preference spectrum' rather than making universal claims."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "No alternative explanations are considered. The paper does not discuss whether negative-lambda 'avoidance' behavior is genuine preference reversal or model degradation, whether cross-domain generalization reflects general capability changes rather than domain-specific effects, or whether the preference accuracy metric could create artifacts."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper uses two complementary metrics — log-probability-based preference accuracy and GPT-4 judged generation accuracy on actual outputs — bridging the gap between internal model behavior and observable output quality. The claims match the granularity of these measurements (proficiency level control, not broader claims about domain expertise)."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Mistral-7B-Instruct-v0.3 is fully specified (Section 5.3). However, 'Claude-3-Sonnet' for data generation lacks a snapshot date, and 'GPT-4' for evaluation judging is stated without any version identifier (Section 5.1, referencing Zheng et al., 2024)."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Appendix B provides actual prompt text for CreatePersona and PersonaHub query generation methods, and Appendix C provides the full instructions for generating avoidance, generic, and expert responses, including the actual prompt text used."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "DPO training reports beta=0.1, batch size 4, one epoch (Section 5.3). However, learning rate is not stated, and temperature/top-p/sampling settings for Claude-3-Sonnet (data generation) and GPT-4 (evaluation) API calls are not reported."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. The method is a direct model editing approach (weight arithmetic) without agent pipelines, tools, or multi-step scaffolding."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 3 documents the full data generation pipeline: persona sampling from PersonaHub (7,000 from 200,000), CreatePersona hierarchical generation, response generation at three levels, cleanup ('truncation and reformatting'), with final counts per domain (Table 6 in Appendix A)."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 7 'Limitations and Future Work' provides substantive discussion of four specific limitations: basic AV extraction approach, architecture constraint, single model testing, and grid search for multidomain alignment."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Limitations are specific to this study: 'we tested our approach only on Mistral-7b' (single model threat), 'our method works only for LLMs with the same architecture' (structural constraint), 'basic approach for obtaining alignment vectors' with concrete alternatives named (SVD, thresholding)."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 7 explicitly states boundaries: only tested on Mistral-7b, only works for same-architecture models, relied on grid search for multidomain. Section 1 also bounds scope to three domains and three proficiency levels."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "No raw data is available. The paper promises dataset availability 'through this link' but no functional URL is provided."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 3 describes data collection in detail: PersonaHub sampling (7,000 personas from 200,000), CreatePersona hierarchical generation (5 initial pairs, 3 randomization rounds), response generation with three-level instructions, and final counts per domain (Table 6)."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "Three human annotators are used for data quality evaluation (Section 3.3) but no information is provided about who they are, how they were recruited, their qualifications, or whether they have domain expertise (medical/financial/legal)."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The pipeline from personas → queries → three-level responses → train/test split is documented with counts at each stage. Figure 1 provides a visual overview. Table 6 gives final data amounts per domain and method."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding disclosure or acknowledgments section is present. The work was done at AWS AI Labs (most authors) and during an internship (first author), but no explicit funding statement is made."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are clearly listed: University of Houston (first author, internship) and AWS AI Labs (remaining authors). The footnote states 'Work done during internship at AWS AI Labs.'"
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "AWS AI Labs (part of Amazon Web Services) has a commercial interest in LLM alignment methods for its products. While the paper uses Mistral (not an AWS model), the alignment technique could benefit AWS's LLM offerings. No funding independence is discussed."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement is included in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "The evaluation uses custom synthetic data generated specifically for this study via Claude-3-Sonnet, not pre-existing benchmarks. The model is fine-tuned via DPO and evaluated on held-out synthetic data that would not exist in Mistral's pre-training corpus."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Not applicable — the paper evaluates on custom synthetic data generated for this study, not on pre-existing benchmarks where pre-training contamination would be a concern."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "Not applicable — no pre-existing benchmark is used for evaluation. All evaluation data is synthetically generated for this study."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "The paper does not conduct a human subjects study. The three annotators in Section 3.3 perform data quality validation, not participation in a study of human behavior."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human subjects study is conducted. Annotators assess synthetic data quality."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human subjects study is conducted."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human subjects study is conducted."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human subjects study is conducted."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human subjects study is conducted."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human subjects study is conducted."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Section 6.2 reports: 'each evaluation takes around 60 seconds.' The abstract and Section 6.1 claim inference cost is reduced by half vs prompting. Section 6.2 computes total grid search time (155 hours) vs joint training time (1,944 hours)."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Section 6.2 states: 'each job, along with its corresponding validation runs, takes about 72 hours of training on A100 GPUs.' Totals compared: 1,944 hours for joint training vs 155 hours for grid search. Hardware is specified as NVIDIA A100 GPUs (Section 5.3)."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No mention of multiple random seeds. Results appear to be from single training runs with no seed sensitivity analysis."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The number of experimental runs is not explicitly stated. The paper says 'trained each model for one epoch' but does not state whether this was repeated across runs."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "For multidomain, the grid search space is described (21^3 = 9,261 evaluations, Section 6.2). However, no hyperparameter search budget is reported for the DPO training itself — beta=0.1 is stated without justifying how it was selected."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The lambda values shown in tables appear to be selected to demonstrate desired behaviors (Exp, Gen, Avd) without clear documentation of how these specific values were chosen. For DPO, beta=0.1 is stated without selection justification."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors implement all baselines (prompting, joint training) themselves and do not acknowledge any potential bias from self-implementation. No independent evaluation is conducted."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "While the paper compares total training time (1,944 vs 155 hours), it does not systematically report accuracy as a function of compute. Joint training achieves near-perfect accuracy at 12x more compute, but this trade-off is not formally analyzed."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The paper does not discuss whether its preference accuracy metric (based on log-probabilities) truly measures proficiency level control, or whether the synthetic three-level data represents meaningful real-world proficiency distinctions."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "No scaffolding is involved in this model editing approach."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "Not discussed. While the synthetic test data was generated for this study, the paper does not address whether Mistral's pre-training data contains similar domain-specific Q&A patterns that could inflate baseline performance."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "Not discussed. The evaluation setup provides all three response levels for log-probability comparison, and the format closely mirrors training data structure. Whether this format matching introduces leakage is not addressed."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "Not discussed. Train and test data are generated from the same persona distributions using the same LLM (Claude-3-Sonnet). Potential structural similarities between splits are not analyzed."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No leakage detection or prevention methods are applied. No deduplication analysis, overlap checks, or contamination tests between train and test sets are mentioned."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Alignment Vectors act as tunable knobs, enabling users to control proficiency levels across a spectrum by adjusting lambda",
    374       "evidence": "Tables 1-3 and Figure 2 show that varying lambda shifts the dominant behavior between Expert, Generic, and Avoidance across medical, financial, and legal domains. E.g., medical domain: lambda=0.5 achieves 95% expert preference accuracy, lambda=-0.7 shifts to generic (44%), lambda=-1.2 shifts to avoidance (84%).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "The model editing approach reduces inference cost by half compared to prompt engineering",
    379       "evidence": "Section 6.1 states 'instruction-augmented prompts are nearly as long as the original queries, which doubles the inference cost.' This is a rough estimate based on prompt length, not direct inference cost measurement.",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "Multidomain diverse preference is achievable via AV arithmetic, 12x faster than joint training",
    384       "evidence": "Section 6.2: grid search (155 hours) vs joint training (1,944 hours). Table 5 shows 22/27 multidomain combinations achieve desired dominant behavior. However, accuracy is substantially lower than joint training (e.g., 46% vs 100% for medical avoidance).",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "AVs are transferable across different fine-tuning stages of the same model",
    389       "evidence": "Section 6.4 and Figure 4 show that proficiency AVs can be applied to a safety-aligned version of Mistral-7B, achieving 0.81 change in medical expert accuracy while safety accuracy changes by only 0.11. Demonstrated on one model variant only.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Model editing does not lead to significant regression in general domain performance",
    394       "evidence": "Table 4 shows safety preference accuracy changes within ±12% and helpfulness within ±18% across lambda values. However, an 18% change in helpfulness could be considered significant, and these claims are not statistically tested.",
    395       "supported": "weak"
    396     },
    397     {
    398       "claim": "Single-domain AVs generalize to other specialized domains",
    399       "evidence": "Table 4(a-c) shows that adding a domain-specific AV (e.g., medical) also shifts preference accuracy in other domains (financial, legal). For example, medical AV at lambda=0.5 achieves 100% expert accuracy in financial and legal domains.",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "No error bars or uncertainty quantification",
    406       "detail": "All results are presented as point estimates from apparently single runs. No confidence intervals, standard deviations, or variance across seeds are reported, making it impossible to assess result stability."
    407     },
    408     {
    409       "flag": "Large accuracy gap between method and joint training baseline",
    410       "detail": "Table 5 shows joint training achieves near-perfect accuracy (90-100%) while the AV method achieves 36-88% for dominant behaviors. The 12x speed advantage comes at a substantial accuracy cost that is acknowledged but not deeply analyzed."
    411     },
    412     {
    413       "flag": "Corporate affiliation evaluating alignment method",
    414       "detail": "Most authors are from AWS AI Labs, which has commercial interest in LLM alignment methods. No competing interests statement or funding disclosure is provided."
    415     },
    416     {
    417       "flag": "Entirely synthetic evaluation",
    418       "detail": "Both training and evaluation data are synthetically generated by Claude-3-Sonnet, with no validation on real-world domain queries from actual medical, legal, or financial professionals. The synthetic three-level proficiency distinction may not reflect real domain expertise gradations."
    419     },
    420     {
    421       "flag": "Small-scale human evaluation for data quality only",
    422       "detail": "Human evaluation involves only 3 annotators reviewing 30 queries for data quality (Section 3.3), not system output quality. The annotators' domain expertise is not described. No human evaluation of the aligned model's generated outputs is performed."
    423     },
    424     {
    425       "flag": "GPT-4 as judge without version specification",
    426       "detail": "GPT-4 is used as an evaluation judge (Section 5.1) without specifying which version, making the evaluation non-reproducible as GPT-4 behavior changes across versions."
    427     },
    428     {
    429       "flag": "Missing contemporary baselines",
    430       "detail": "Several related inference-time alignment methods are discussed in Section 2 (ITI, DeAl, SteerLM, CPO, InferAligner) but none are included as empirical baselines. Only prompting and joint training are compared against."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "Training language models to follow instructions with human feedback",
    436       "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"],
    437       "year": 2022,
    438       "relevance": "Foundational RLHF paper for LLM alignment, the primary training-time alignment approach this work aims to complement."
    439     },
    440     {
    441       "title": "Direct preference optimization: Your language model is secretly a reward model",
    442       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"],
    443       "year": 2024,
    444       "relevance": "DPO algorithm used as the alignment training method in this paper to produce the aligned models from which AVs are derived."
    445     },
    446     {
    447       "title": "Editing models with task arithmetic",
    448       "authors": ["Gabriel Ilharco", "Marco Tulio Ribeiro", "Mitchell Wortsman"],
    449       "year": 2023,
    450       "relevance": "Core methodological inspiration — the paper's Alignment Vectors are directly based on the task arithmetic concept of subtracting base model weights from fine-tuned model weights."
    451     },
    452     {
    453       "title": "InferAligner: Inference-time alignment for harmlessness through cross-model guidance",
    454       "authors": ["Pengyu Wang", "Dong Zhang", "Linyang Li"],
    455       "year": 2024,
    456       "arxiv_id": "2401.11206",
    457       "relevance": "Inference-time alignment method using safety-related vectors to steer harmful outputs, closely related approach but focused on safety only."
    458     },
    459     {
    460       "title": "Inference-time intervention: Eliciting truthful answers from a language model",
    461       "authors": ["Kenneth Li", "Oam Patel", "Fernanda Viégas"],
    462       "year": 2024,
    463       "relevance": "Inference-time intervention method targeting truthfulness via activation shifting, related inference-time alignment technique."
    464     },
    465     {
    466       "title": "DeAl: Decoding-time alignment for large language models",
    467       "authors": ["James Y Huang", "Sailik Sengupta", "Daniele Bonadiman"],
    468       "year": 2024,
    469       "arxiv_id": "2402.06147",
    470       "relevance": "Decoding-time alignment via heuristic-guided search, an alternative inference-time alignment approach discussed as related work."
    471     },
    472     {
    473       "title": "SteerLM: Attribute conditioned SFT as an (user-steerable) alternative to RLHF",
    474       "authors": ["Yi Dong", "Zhilin Wang", "Makesh Narsimhan Sreedhar"],
    475       "year": 2023,
    476       "arxiv_id": "2310.05344",
    477       "relevance": "Controllable generation via attribute-conditioned training, related approach to user-steerable alignment but requires specific training format."
    478     },
    479     {
    480       "title": "Controllable preference optimization: Toward controllable multi-objective alignment",
    481       "authors": ["Yiju Guo", "Ganqu Cui", "Lifan Yuan"],
    482       "year": 2024,
    483       "arxiv_id": "2402.19085",
    484       "relevance": "Multi-objective controllable alignment via control tokens, related approach to multi-dimensional preference control."
    485     },
    486     {
    487       "title": "BeaverTails: Towards improved safety alignment of LLM via a human-preference dataset",
    488       "authors": ["Jiaming Ji", "Mickel Liu", "Josef Dai"],
    489       "year": 2024,
    490       "relevance": "PKU-SafeRLHF safety preference dataset used in this paper for safety alignment experiments in Sections 6.3-6.4."
    491     },
    492     {
    493       "title": "Decoding-time realignment of language models",
    494       "authors": ["Tianlin Liu", "Shangmin Guo", "Leonardo Bianco"],
    495       "year": 2024,
    496       "arxiv_id": "2402.02992",
    497       "relevance": "Regularization-based inference-time alignment control, closely related approach that the authors argue lacks fine-grained preference level flexibility."
    498     },
    499     {
    500       "title": "TIES-merging: Resolving interference when merging models",
    501       "authors": ["Prateek Yadav", "Derek Tam", "Leshem Choshen"],
    502       "year": 2024,
    503       "relevance": "Advanced model merging technique cited as future work direction for improving alignment vector extraction beyond simple subtraction."
    504     },
    505     {
    506       "title": "A general theoretical paradigm to understand learning from human preferences",
    507       "authors": ["Mohammad Gheshlaghi Azar", "Zhaohan Daniel Guo", "Bilal Piot"],
    508       "year": 2024,
    509       "relevance": "IPO loss function framework used in this paper's DPO training for alignment."
    510     }
    511   ],
    512   "engagement_factors": {
    513     "practical_relevance": {
    514       "score": 2,
    515       "justification": "Inference-time preference control via weight arithmetic is a useful deployment technique, but requires DPO training first and is tested on only one model."
    516     },
    517     "surprise_contrarian": {
    518       "score": 1,
    519       "justification": "Extending task arithmetic from task performance to alignment preference is a natural extension of existing work, not a surprising finding."
    520     },
    521     "fear_safety": {
    522       "score": 1,
    523       "justification": "Brief discussion of safety implications — negative lambda can partially reverse safety alignment, but the model doesn't become fully unsafe."
    524     },
    525     "drama_conflict": {
    526       "score": 0,
    527       "justification": "No controversy, no challenge to established methods or institutions."
    528     },
    529     "demo_ability": {
    530       "score": 0,
    531       "justification": "No code, demo, or working data link is released. Dataset link is promised but not provided."
    532     },
    533     "brand_recognition": {
    534       "score": 1,
    535       "justification": "AWS AI Labs is a recognized but not top-tier ML research brand in the alignment space."
    536     }
    537   }
    538 }

Impressum · Datenschutz