ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (31505B)


      1 {
      2   "paper": {
      3     "title": "Exposing Privacy Gaps: Membership Inference Attack on Preference Data for LLM Alignment",
      4     "authors": [
      5       "Qizhang Feng",
      6       "Siva Rajesh Kasa",
      7       "Santhosh Kasa",
      8       "Hyokun Yun",
      9       "Choon Hui Teo",
     10       "Sravan Bodapati"
     11     ],
     12     "year": 2025,
     13     "venue": "International Conference on Artificial Intelligence and Statistics (AISTATS) 2025",
     14     "arxiv_id": "2407.06443",
     15     "doi": "10.48550/arXiv.2407.06443"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["theoretical", "benchmark-eval"],
     20   "key_findings": "DPO-aligned LLMs are significantly more vulnerable to membership inference attacks (MIA) than PPO-aligned models, both theoretically (due to overfitting on preference data) and empirically (AUROC up to 0.93 for DPO vs ~0.52 for PPO on tuple detection). The novel PREMIA reference-based attack framework, which leverages the base/SFT model as a reference, consistently outperforms existing MIA baselines. The relationship between model size and MIA vulnerability is nuanced, depending on task complexity—larger pretrained models on easier tasks are less susceptible. PPO provides comparable utility to DPO while offering substantially better privacy protection.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The paper's own reproducibility checklist claims code is available 'as footnotes in §4.2', but these footnotes only link to external packages (TRL, PEFT, BitsAndBytes), not the authors' own experiment code. No repository URL or code archive is provided."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The experiments use publicly available datasets: Stack-Exchange-Paired (https://huggingface.co/datasets/lvwerra/stack-exchange-paired) and IMDB-RLHF-Pair (Rafailov et al., 2024). Both are accessible without restriction."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Appendix C lists package names (TRL, PEFT, BitsAndBytes) and LoRA settings, but provides no version numbers for any library, no requirements.txt, and no environment specification sufficient to recreate the setup."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are provided. Appendix C has implementation details (hyperparameters, training settings) but no commands, scripts, or README-style guide to replicate the experiments."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All AUROC scores in Tables 1, 2, 3, 4, and 5 are reported as single point estimates with no confidence intervals, error bars, or ± notation."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims DPO is more vulnerable than PPO and that PREMIA outperforms baselines based solely on comparing raw AUROC numbers. No statistical significance tests (t-tests, bootstrap, etc.) are applied to any comparison."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper does not report formal effect sizes. Raw AUROC values are presented in tables, but no Cohen's d, relative improvement, or other effect size measures are computed for DPO-vs-PPO or PREMIA-vs-baseline comparisons."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "80k training examples are used for SE and 20k for IMDB. No justification is given for these sizes, no power analysis is performed, and there is no discussion of whether these sizes are sufficient for the statistical claims being made."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No standard deviations, variance across seeds, or any spread measures are reported. All results appear to be single-run experiments."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Multiple MIA baselines are compared: Perplexity (Yeom et al., 2018), Zlib (Carlini et al., 2021), Lowercase (Carlini et al., 2021), Ref, MIN-K (Shi et al., 2023), and Neighbourhood attack (Mattern et al., 2023). See Table 1 and Appendix A."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include recent methods: MIN-K% PROB (Shi et al., 2023/2024), DC-PDD (Zhang et al., 2024), and ReCALL (Xie et al., 2024). The Neighbourhood attack is from 2023. These represent the state of the art in LLM MIA."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper ablates the reference model choice: PREMIA-base (pretrained model as reference) vs PREMIA-SFT (SFT model as reference). Table 3 tests cross-family reference models (Open-llama-3b/7b swapped). Individual vs tuple attack modes are also compared."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "MIA performance uses AUROC across Chosen, Rejected, and Pair attack modes. Utility evaluation uses 10+ metrics: Reward, Perplexity, MSTTR-100, Distinct-1/2, Unique-1/2, BERTScore, ROUGE, BLEU, METEOR (Table 2)."
     91       },
     92       "human_evaluation": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "Human evaluation is not relevant to MIA vulnerability assessment, which is inherently a statistical detection task measured by AUROC."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "MIA evaluation inherently requires separation: training data (members) vs non-training data (non-members). SE uses data/rl for training and data/evaluation for validation. IMDB uses 20k for training and remaining for evaluation."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down by model (9 models across GPT2 and larger families), dataset (SE, IMDB), response type (Chosen, Rejected, Pair), and attack method. Tables 1, 4, and 5 provide detailed per-category results."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 4.3.2 discusses where MIA is less effective: larger models on easier tasks (IMDB), PPO being nearly impervious (AUROC ~0.5), and the nuanced relationship between model size and vulnerability. Fig. 3 shows rapid DPO convergence explaining fast overfitting."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "PPO is shown to be nearly impervious to all MIA frameworks (AUROC ~0.5 in most settings). Table 4 IMDB results show PREMIA tuple detection fails for larger models (AUROC 0.500-0.556). Traditional MIA methods on GPT2 are barely above random (Table 5)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims: (1) the theoretical motivation for DPO vulnerability is supported by §2.4 Propositions 1-3 and Theorem 2.1; (2) the PREMIA framework is introduced in §3 and validated in §4; (3) the empirical demonstration of DPO vulnerability is supported by Tables 1 and 4."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper claims 'DPO tends to overfit on the preference data vis à vis PPO' and provides both theoretical justification (Propositions 1-2 show DPO has tighter training-error bounds) and controlled comparisons (same data, same base models, DPO vs PPO as the only variable)."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper generally stays within its evidence bounds, qualifying findings by model family and task difficulty. Section 4.3.2 notes the relationship between model size and vulnerability is 'more nuanced, depending on the task complexity.' The theoretical results are explicitly framed with their assumptions."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper attributes DPO's higher vulnerability entirely to overfitting via its theoretical framework. It does not discuss alternative explanations such as differences in training hyperparameters (DPO lr=5e-4 vs PPO lr=5.4e-5), training epochs, LoRA convergence behavior, or whether the reward model in PPO acts as a regularizer beyond the KL penalty."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper measures AUROC on membership inference, which directly corresponds to the claimed privacy vulnerability. No proxy gap exists—AUROC on train/non-train classification is the standard measure for MIA effectiveness."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Specific model versions are stated: Gemma-2-2B, Mistral-7B-v0.3, Mistral-7B-v0.1, Open-llama-3b, Open-llama-7b, GPT2, GPT2-medium, GPT2-large, GPT2-xl. These are identifiable open-source model checkpoints."
    150       },
    151       "prompts_provided": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "The paper does not use prompting as a methodology. It trains and fine-tunes models on preference datasets. Prompts come from the datasets themselves, not from experiment design."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Appendix C provides detailed hyperparameters for SFT (lr=8e-5, epochs=2, batch=4, warmup=100, etc.), PPO (lr=5.4e-5, batch=16, KL coef=0.1, PPO epochs=6, etc.), DPO (lr=5e-4, epochs=3, beta=0.4, etc.), and LoRA settings (alpha=32, dropout=0.05, r=16)."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. The paper trains and evaluates models directly."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Data splits are documented: SE uses data/rl for training (80k selected) and data/evaluation for validation. IMDB uses 20k for training and remaining for validation. Maximum length constraints are specified (prompt: 256, sequence: 1024, output: 128 for PPO)."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 5 'Conclusion and Limitations' contains a substantive paragraph discussing limitations including: no exploration of mitigation techniques (DP-SGD, model pruning, privacy-aware losses), and PREMIA's requirement for base model access limiting applicability to closed-source LLMs."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The limitations are specific to this study: (1) PREMIA's optimistic assumption of base model access limits real-world applicability for closed-source models; (2) the paper focuses only on DPO and PPO, leaving other alignment methods unexamined; (3) mitigation strategies like DP-SGD are not explored."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The paper states: 'While we have focused on highlighting MIA vulnerabilities, we haven't touched upon the various ways to mitigate them.' It also notes PREMIA works for open-source models only: 'there is a need for designing effective frameworks for closed-source LLMs where there is no access to the base model.'"
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "The source datasets (SE, IMDB) are public, but the trained model checkpoints, intermediate outputs, and raw AUROC computation results are not released for independent verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The datasets are described: SE contains Stack Exchange questions/answers with vote-based preferences; IMDB-RLHF-Pair has sentiment-based preferences. Data splits and sizes are specified in §4.2."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. All data comes from publicly available benchmark datasets (Stack Exchange, IMDB)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The pipeline is documented across sections: public datasets → SFT training → PPO/DPO alignment → MIA evaluation using various frameworks → AUROC computation. Implementation details in Appendix C cover each stage."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding disclosure or acknowledgments section is present. All authors are from Amazon Inc. but no statement about funding source is provided."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "All authors are listed under 'Amazon Inc.' affiliation on the first page."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Amazon (the implied funder) does not have a specific commercial stake in DPO being more vulnerable than PPO. The paper evaluates open-source models, not Amazon products."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests statement or financial disclosure is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "The paper trains models on specific preference datasets to study MIA vulnerability. It does not evaluate a pre-trained model's capability on a benchmark—the evaluation measures attack effectiveness on aligned models."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "The paper studies MIA (detecting training data membership), not model knowledge on benchmarks. Train/test separation is the experimental variable, not a contamination concern."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "The paper tests an attack framework's ability to detect training data membership, not a model's benchmark performance. Standard benchmark contamination concepts do not apply."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants. The study is entirely computational, evaluating models on public datasets."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants. All experiments involve model training and automated evaluation."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in the study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in the study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in the study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in the study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in the study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No inference costs, API costs, or timing information are reported for either the alignment training or the MIA evaluation. The computational cost of running PREMIA or baselines is not quantified."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No GPU hours, hardware specifications, or total compute budget is stated, despite training 9 models with both DPO and PPO across two datasets. The paper's own checklist claims computing infrastructure is described, but it is not visible in the provided text."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No mention of multiple random seeds. All results appear to be from single runs. Given that MIA evaluation depends on model training, which is stochastic, seed sensitivity is relevant but unreported."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is never stated. It is unclear whether results are from a single run or averaged across multiple runs."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No hyperparameter search is described. The paper uses fixed hyperparameters (Appendix C) without explaining how they were selected or how many configurations were tried."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The paper uses specific hyperparameter configurations (e.g., DPO lr=5e-4, beta=0.4; PPO lr=5.4e-5, KL coef=0.1) without justifying why these were chosen or how they were selected."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper makes many comparative claims across 9 models × 8 attack methods × 2 datasets × 3 response types, but performs no formal statistical tests at all, let alone corrections for multiple comparisons."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors compare their PREMIA framework against baselines without acknowledging potential bias in running baselines in their own experimental setup. No independent evaluation or acknowledgment of author-evaluation bias is provided."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "DPO and PPO have different computational requirements (DPO is typically cheaper as it skips reward model training), but the paper does not report or compare compute budgets for the two approaches."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "AUROC is used as the sole MIA metric without discussing whether it adequately captures real-world privacy risk. No discussion of whether AUROC on these specific datasets generalizes to actual privacy threats in deployment."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No scaffolding is involved. The paper directly trains and evaluates models."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "The base pretrained models (Mistral, Llama, GPT2) may have been trained on data overlapping with Stack Exchange or IMDB, which could affect baseline probability distributions and thus PREMIA's reference-based comparisons. This is not discussed."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the evaluation setup might leak information. For instance, using the same base model for reference and alignment could create subtle dependencies not accounted for."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "Train and non-train splits come from the same dataset distribution. No discussion of whether structural similarities between member and non-member examples could affect MIA evaluation validity."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No concrete method is used to detect whether base pretrained models may have seen the preference data during pretraining, which could confound the MIA results."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "DPO-aligned models are more vulnerable to membership inference attacks than PPO-aligned models.",
    372       "evidence": "Theoretical: Propositions 1-2 show DPO has tighter training-error bounds (overfits more), Proposition 3 proves M(πDPO, z1) ≥ M(πPPO, z1) under MALT assumption, Theorem 2.1 provides a general lower bound linking overfitting to MIA susceptibility. Empirical: Tables 1 and 4 show consistently higher AUROC for DPO across all models, datasets, and attack methods.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "PREMIA consistently achieves higher AUROC than existing MIA frameworks on DPO-aligned models.",
    377       "evidence": "Table 1 shows PREMIA-SFT achieves highest or second-highest AUROC in most DPO columns across Gemma-2-2B, Mistral-7B, Open-llama models. For SE tuple detection, PREMIA reaches 0.93 (Figure 2) vs ~0.5-0.6 for baselines. Table 4 confirms across GPT2 series.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "PPO provides similar utility performance to DPO with substantially lower MIA vulnerability.",
    382       "evidence": "Table 2 shows Mistral-7B PPO has better Reward (-0.771 vs -1.035), comparable BERTScore (0.883 vs 0.877), ROUGE (0.457 vs 0.443), while MIA AUROC is ~0.52 vs 0.80-0.93 for DPO.",
    383       "supported": "weak"
    384     },
    385     {
    386       "claim": "The relationship between model size and MIA vulnerability depends on task complexity.",
    387       "evidence": "Section 4.3.2: GPT2 models show higher vulnerability on DPO than larger models (Table 4 SE vs Table 1). For easier IMDB task, larger models like Mistral-7B show lower vulnerability. Fig. 3 shows Mistral-7B achieves >90% DPO accuracy in 0.2 epochs on IMDB, indicating rapid convergence on easy tasks.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "DPO overfits on preference data more than PPO, which is the theoretical mechanism behind its higher MIA vulnerability.",
    392       "evidence": "Propositions 1-2 show DPO's error bound on training data is 2εr (reward error only) while PPO's population bound includes +2εy (response distribution error), indicating DPO fits training distribution better. Theorem 2.1 connects overfitting probability to MIA score lower bound.",
    393       "supported": "moderate"
    394     }
    395   ],
    396   "red_flags": [
    397     {
    398       "flag": "No error bars or uncertainty quantification",
    399       "detail": "All AUROC scores across Tables 1-5 are single point estimates. No confidence intervals, standard deviations, or multiple-run statistics are reported. Many DPO-vs-PPO differences are small (e.g., 0.56 vs 0.53) and could be within noise."
    400     },
    401     {
    402       "flag": "No significance tests for comparative claims",
    403       "detail": "The paper makes extensive claims about DPO being more vulnerable than PPO and PREMIA outperforming baselines, but no statistical significance tests support any of these comparisons. Hundreds of pairwise comparisons are made by visual inspection alone."
    404     },
    405     {
    406       "flag": "Confounded DPO-vs-PPO comparison",
    407       "detail": "DPO and PPO use substantially different hyperparameters (DPO lr=5e-4 vs PPO lr=5.4e-5; DPO beta=0.4; different epoch counts; PPO has reward model). The vulnerability difference could partially stem from these hyperparameter choices rather than the alignment method itself."
    408     },
    409     {
    410       "flag": "Utility comparison on single model/dataset only",
    411       "detail": "The privacy-utility trade-off analysis (Table 2) is shown only for Mistral-7B on the SE dataset. The claim that 'PPO provides similar utility' is not validated across the other 8 models and second dataset."
    412     },
    413     {
    414       "flag": "Amazon affiliation with no funding or conflict disclosure",
    415       "detail": "All six authors are from Amazon Inc. No funding disclosure, acknowledgments, or competing interests statement is present, despite Amazon being a major provider of LLM services."
    416     },
    417     {
    418       "flag": "Theoretical results rely on restrictive assumptions",
    419       "detail": "Proposition 3 relies on the MALT assumption, which the paper itself calls 'quite restrictive.' Theorem 2.1 adds the assumption that likelihoods are i.i.d. under the posterior. A footnote notes an assumption was 'added in a later revision to address a limitation in the original analysis.'"
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "Direct preference optimization: Your language model is secretly a reward model",
    425       "authors": ["R. Rafailov", "A. Sharma", "E. Mitchell", "C. Manning", "S. Ermon", "C. Finn"],
    426       "year": 2024,
    427       "relevance": "Foundational DPO method, a central subject of this paper's privacy analysis of LLM alignment techniques."
    428     },
    429     {
    430       "title": "Detecting pre-training data from large language models",
    431       "authors": ["W. Shi", "A. Ajith", "M. Xia", "Y. Huang", "D. Liu", "T. Blevins", "D. Chen", "L. Zettlemoyer"],
    432       "year": 2024,
    433       "arxiv_id": "2310.16789",
    434       "relevance": "Proposes MIN-K% PROB MIA method used as a baseline; studies membership inference on pretrained LLMs."
    435     },
    436     {
    437       "title": "Membership inference attacks against machine learning models",
    438       "authors": ["R. Shokri", "M. Stronati", "C. Song", "V. Shmatikov"],
    439       "year": 2017,
    440       "relevance": "Foundational work on membership inference attacks in machine learning, establishing the threat model used in this paper."
    441     },
    442     {
    443       "title": "Extracting training data from large language models",
    444       "authors": ["N. Carlini"],
    445       "year": 2021,
    446       "relevance": "Key prior work on LLM privacy risks; Zlib and Lowercase MIA baselines used in this paper originate from this work."
    447     },
    448     {
    449       "title": "Training language models to follow instructions with human feedback",
    450       "authors": ["L. Ouyang", "J. Wu", "X. Jiang", "D. Almeida"],
    451       "year": 2022,
    452       "relevance": "InstructGPT paper establishing RLHF/PPO pipeline for LLM alignment, one of the two alignment methods studied."
    453     },
    454     {
    455       "title": "Deep learning with differential privacy",
    456       "authors": ["M. Abadi", "A. Chu", "I. Goodfellow", "H.B. McMahan", "I. Mironov", "K. Talwar", "L. Zhang"],
    457       "year": 2016,
    458       "relevance": "DP-SGD framework mentioned as a potential MIA mitigation technique for preference data alignment."
    459     },
    460     {
    461       "title": "Is DPO superior to PPO for LLM alignment? A comprehensive study",
    462       "authors": ["S. Xu", "W. Fu", "J. Gao", "W. Ye", "W. Liu", "Z. Mei", "G. Wang", "C. Yu", "Y. Wu"],
    463       "year": 2024,
    464       "arxiv_id": "2404.10719",
    465       "relevance": "Comprehensive comparison of DPO vs PPO for alignment quality, directly relevant to the utility-privacy trade-off analyzed here."
    466     },
    467     {
    468       "title": "Do membership inference attacks work on large language models?",
    469       "authors": ["M. Duan", "A. Suri", "N. Mireshghallah", "S. Min", "W. Shi", "L. Zettlemoyer", "Y. Tsvetkov", "Y. Choi", "D. Evans", "H. Hajishirzi"],
    470       "year": 2024,
    471       "arxiv_id": "2402.07841",
    472       "relevance": "Finds most MIAs barely outperform random guessing on pretrained LLMs, motivating the focus on fine-tuned/aligned models in this paper."
    473     },
    474     {
    475       "title": "LLM dataset inference: Did you train on my dataset?",
    476       "authors": ["P. Maini", "H. Jia", "N. Papernot", "A. Dziedzic"],
    477       "year": 2024,
    478       "arxiv_id": "2406.06443",
    479       "relevance": "Proposes Dataset Inference Attack for pretrained LLMs, complementary to the preference data MIA studied here."
    480     },
    481     {
    482       "title": "White-box vs black-box: Bayes optimal strategies for membership inference",
    483       "authors": ["A. Sablayrolles", "M. Douze", "C. Schmid", "Y. Ollivier", "H. Jégou"],
    484       "year": 2019,
    485       "relevance": "Provides the theoretical framework (MALT assumption, Bayes optimal membership inference) that underpins the theoretical results in this paper."
    486     },
    487     {
    488       "title": "Fundamental limits of membership inference attacks on machine learning models",
    489       "authors": ["E. Aubinais", "E. Gassiat", "P. Piantanida"],
    490       "year": 2023,
    491       "arxiv_id": "2310.13786",
    492       "relevance": "Provides fundamental lower bounds on MIA that this paper extends with a tighter bound (Lemma 3, Theorem 2.1)."
    493     },
    494     {
    495       "title": "Practical membership inference attacks against fine-tuned large language models via self-prompt calibration",
    496       "authors": ["W. Fu", "H. Wang", "C. Gao", "G. Liu", "Y. Li", "T. Jiang"],
    497       "year": 2023,
    498       "arxiv_id": "2311.06062",
    499       "relevance": "Studies MIA on fine-tuned LLMs, directly related to the alignment-stage MIA focus of this paper."
    500     }
    501   ],
    502   "engagement_factors": {
    503     "practical_relevance": {
    504       "score": 1,
    505       "justification": "PREMIA could be used for privacy auditing of aligned LLMs, but requires reference model access and no code is released."
    506     },
    507     "surprise_contrarian": {
    508       "score": 1,
    509       "justification": "DPO's higher vulnerability is somewhat intuitive given it directly optimizes on preference data; the finding confirms rather than contradicts expectations."
    510     },
    511     "fear_safety": {
    512       "score": 2,
    513       "justification": "Demonstrates that widely-used DPO alignment leaks preference data membership, raising real privacy concerns for organizations fine-tuning on sensitive human feedback."
    514     },
    515     "drama_conflict": {
    516       "score": 1,
    517       "justification": "Mild tension: DPO is popular for its simplicity but shown to have a privacy cost, though this is presented as a technical finding rather than a controversy."
    518     },
    519     "demo_ability": {
    520       "score": 0,
    521       "justification": "No code released, no demo, no public tool available."
    522     },
    523     "brand_recognition": {
    524       "score": 1,
    525       "justification": "Amazon authors provide some recognition, but the paper is about open-source models (GPT2, Mistral, Llama), not Amazon products."
    526     }
    527   }
    528 }

Impressum · Datenschutz