scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29613B)
      1 {
      2   "paper": {
      3     "title": "Probing Language Models for Pre-training Data Detection",
      4     "authors": [
      5       "Zhenhua Liu",
      6       "Tong Zhu",
      7       "Chuanyuan Tan",
      8       "Haonan Lu",
      9       "Bing Liu",
     10       "Wenliang Chen"
     11     ],
     12     "year": 2024,
     13     "venue": "Annual Meeting of the Association for Computational Linguistics",
     14     "arxiv_id": "2406.01333",
     15     "doi": "10.48550/arXiv.2406.01333"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "The paper proposes using linear probes on LLM internal activations for pre-training data detection, achieving state-of-the-art AUC on WikiMIA (69.8) and ArxivMIA (60.3). The new ArxivMIA benchmark, comprising arxiv abstracts from CS and Math, proves substantially more challenging than WikiMIA across all detection methods. The probing approach benefits from larger models and is data-efficient (optimal at ~200 training examples), but requires domain-specific training data and additional compute for proxy model fine-tuning compared to baselines.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper states in a footnote: 'Our code and dataset are available at https://github.com/zhliu0106/probing-lm-data' (Section 1)."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The ArxivMIA benchmark dataset is released at the same GitHub repository. They also use the publicly available WikiMIA benchmark."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper mentions '2 NVIDIA A100 (40GB) GPUs' (Section 5.4) but provides no requirements.txt, Dockerfile, or detailed library version specifications."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repository may contain them, but the paper itself does not include a 'Reproducing Results' section or detailed run commands."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All results in Tables 2, 3, 4, 5, and 6 report only point estimates (AUC values, TPR) with no confidence intervals or error bars."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims 'our method consistently outperforms all baselines' (Section 6.1) based solely on comparing numerical AUC/TPR values without any statistical significance tests."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Tables 2 and 3 show absolute AUC and TPR values for both baselines and the proposed method, allowing readers to assess the magnitude of improvement (e.g., 69.8 vs 65.5 on Pythia WikiMIA)."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification is given for benchmark sizes (776 for WikiMIA, 2000 for ArxivMIA) or training data sizes. The ablation on training data number (Figure 3) explores different sizes but doesn't justify the final choices."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "All reported results appear to be single-run numbers. No standard deviation, variance, or spread measures are provided anywhere in the paper."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Six baselines are compared: Loss Attack, Neighbor Attack, Min-K% Prob, Zlib Compression, Lowercased Text, and Smaller Model (Section 5.3)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include Min-K% Prob (Shi et al., 2023) and Neighbor Attack (Mattern et al., 2023), which are recent and represent the state of the art in pre-training data detection."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Multiple ablations are conducted: model size (Figure 2), training data number (Figure 3), prompt template selection (Table 8), real vs synthetic data (Tables 2-3), full fine-tuning vs LoRA (Table 5), and cross-domain evaluation (Table 6)."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Two metrics are used: AUC (Table 2) and TPR at 5% FPR (Table 3), as stated in Section 5.2."
     91       },
     92       "human_evaluation": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "This is a binary classification task (member vs non-member) with known ground truth labels. Human evaluation is irrelevant — automated metrics fully capture detection performance."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "'We split each dataset into a validation set and a test set in a ratio of 1:4. The validation set is used to select the best hyperparameters, and the test set is used to evaluate the performance' (Section 5.1)."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down by benchmark (WikiMIA, ArxivMIA), by model (Pythia, OPT, TinyLLaMA, OpenLLaMA), and by ArxivMIA subcategory (CS vs Math) in Tables 2 and 3."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 6.1 discusses poor performance on ArxivMIA-Math ('values are only above 50'). Section 6.4 acknowledges 'the overall detection efficacy is unsatisfactory' at low duplication counts. Cross-domain performance degradation is discussed in Section 6.2."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Performance decline with >200 training samples (Figure 3), LoRA underperforming full fine-tuning (Table 5, 62.7 vs 74.3), cross-domain AUC drops (Table 6), and near-chance performance on contamination detection challenge (Table 4) are all reported."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims 'outperforms all the baselines, and achieves state-of-the-art performance on both WikiMIA and ArxivMIA' which is supported by Tables 2 and 3 showing the highest average AUC and TPR values."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The ablation studies (model size, training data number, prompt template, LoRA vs fine-tuning) use controlled single-variable manipulation to support causal claims about component contributions. The proxy model injection step is a controlled intervention."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title 'Probing Language Models for Pre-training Data Detection' is broad, but experiments are limited to 4 models (up to 13B), 2 benchmarks, and English text only. The Limitations section notes domain-specific data requirements but the abstract and title do not bound the claims to the tested setting."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper does not discuss alternative explanations for why the probing method works. For example, the probe might detect stylistic or distributional differences between member and non-member data rather than actual pre-training memorization signals. No confound analysis is provided."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper measures AUC and TPR@5%FPR for pre-training data detection, and claims are framed at the same granularity — detecting whether text was in pre-training data. No broader proxy gap exists between measurement and framing."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Specific model names with sizes are given: Pythia-2.8B, OPT-6.7B, TinyLLaMA-1.1B, OpenLLaMA-13B, Pythia-70M, OPT-350M, OpenLLaMA-3B, Contam-1.4b, RoBERTa-base (Section 5.4). These are specific, versioned open-source models."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "The actual prompt template is provided verbatim: 'Here is a statement: [SAMPLE] \\n Is the above statement correct? Answer:' (Section 3.3). All alternative templates tested are also shown in Table 8 (Appendix B)."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "The paper states 'the best choice is to put all the data to be injected into one batch and train for 2 epochs' and mentions grid search, but does not report the actual learning rates, which layer activations were extracted from, or other hyperparameter values for each model-dataset combination."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. The method is a direct pipeline: fine-tune proxy model → extract activations → train linear probe."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 4 documents ArxivMIA construction (post-2024 non-member, RedPajama member), Section 4.2 describes training data collection (real vs synthetic), prompt template processing (Section 3.3), and dataset splits (1:4 validation/test, Section 5.1). Appendix A details synthetic data generation."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "A dedicated 'Limitations' section discusses two specific limitations: generalization requiring domain-specific training data, and computational resource requirements with LoRA comparison."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The limitations are specific to this study: 'the generalizability of the probe classifier, which necessitates domain-specific training data' and the computational overhead with a concrete LoRA comparison showing 62.7 vs 74.3 AUC (Table 5)."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not explicitly state what was NOT tested. The mention of 'future work could extend our methods to larger model scales or apply them to multi-modal models' implicitly suggests untested areas but does not explicitly bound the current scope (e.g., English-only, models up to 13B, specific domains only)."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "The ArxivMIA dataset and code are released at https://github.com/zhliu0106/probing-lm-data. WikiMIA is a public benchmark. The underlying data can be independently verified."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 4.1 describes ArxivMIA construction: member data from RedPajama arxiv subset, non-member from post-2024 arxiv abstracts. Section 4.2 describes training data collection (real data post-model-release, synthetic via ChatGPT). Table 1 provides counts."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data sources are standard benchmarks (WikiMIA) and arxiv/RedPajama (ArxivMIA)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The full pipeline is documented: data collection → member/non-member split → proxy model fine-tuning → activation extraction → probe training → evaluation. Dataset sizes at each stage are given in Table 1 and Section 4.2."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Acknowledgments section lists: 'National Natural Science Foundation of China (Grant No. 62036004, 62376177)', Provincial Key Laboratory for Computer Information Processing Technology, Collaborative Innovation Center, and Priority Academic Program Development."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Soochow University (Institute of Artificial Intelligence) and OPPO AI Center, China."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Funding is from the National Natural Science Foundation of China and academic institutions, which have no financial stake in the method's performance. OPPO is an affiliation but the paper does not evaluate OPPO products."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is included in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The paper does not explicitly state training data cutoff dates for the evaluated models. It references that TinyLLaMA/OpenLLaMA were 'pre-trained on RedPajama' and ArxivMIA uses post-2024 non-member data, but specific cutoff dates for each model are not provided."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": true,
    242         "justification": "The entire paper is about detecting train/test overlap. The benchmarks are constructed with known member/non-member splits based on temporal cutoffs and training data provenance (Section 4)."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "ArxivMIA is designed with contamination awareness: non-member data is post-2024, member data is from RedPajama (known training data). WikiMIA similarly uses temporal splits. The ground truth membership is established by construction."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No inference cost, wall-clock time, or cost per example is reported. The method requires fine-tuning a proxy model and training a probe, but the computational cost of these steps is not quantified."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "The paper mentions '2 NVIDIA A100 (40GB) GPUs' (Section 5.4) but does not state total GPU hours, training time, or overall computational budget."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No mention of multiple random seeds. All results appear to be from single runs without seed sensitivity analysis."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is never explicitly stated. Results are presented without indication of how many runs produced them."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The paper states 'we conducted a grid search hyperparameters on a held-out validation set' (Section 5.4) but does not report how many configurations were tried or total compute spent on search."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": true,
    318         "justification": "Selection is done on a validation set: 'we set the best learning rate and activation extraction model layer according to the performance of the validation set' (Section 5.4). The validation/test split is 1:4."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper compares 8 methods across 8 model-benchmark combinations without any correction for multiple comparisons or family-wise error rate control."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors implement and evaluate their own method against baselines they also implement, without acknowledging author-evaluation bias or using independent evaluation."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The proposed method requires fine-tuning a proxy model and training a probe classifier, which is substantially more expensive than reference-free baselines (which need only one forward pass). This compute difference is not quantified or discussed as a trade-off."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The paper does not discuss whether AUC on WikiMIA/ArxivMIA actually measures real-world pre-training data detection ability. The construct validity of using temporal splits as ground truth for membership is not questioned."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No agentic scaffolding is involved in this work."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": true,
    350         "justification": "Temporal splits are explicitly used: ArxivMIA designates 'abstracts published post-2024 as non-member data' while member data comes from RedPajama (Section 4.1). WikiMIA also uses temporal splits by design."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "The paper does not discuss whether features other than actual membership (e.g., text style differences between pre-2024 and post-2024 arxiv abstracts, or domain shifts in RedPajama) could leak membership status to the probe."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "The paper does not discuss whether member and non-member samples might share structural similarities or whether there are distributional differences between RedPajama-sourced members and post-2024 non-members beyond actual pre-training membership."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": true,
    365         "justification": "The paper's method itself IS a leakage detection technique (probing for pre-training data membership). Additionally, temporal splits serve as a concrete prevention method for benchmark construction."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "The probing method outperforms all baselines on both WikiMIA and ArxivMIA benchmarks, achieving state-of-the-art AUC values.",
    372       "evidence": "Table 2 shows the highest AUC for Probe Attack across all model-benchmark combinations (e.g., 69.8 on Pythia WikiMIA vs next-best 65.5). Table 3 shows highest average TPR@5%FPR (11.8 with real data).",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "ArxivMIA is a more challenging benchmark than WikiMIA for pre-training data detection.",
    377       "evidence": "Table 2 shows consistently lower AUC values on ArxivMIA vs WikiMIA across all methods. Section 6.1 notes 'the average performance across all detection methods is notably lower on ArxivMIA.' Cross-domain results in Table 6 also support this.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "The probing method benefits from larger model sizes.",
    382       "evidence": "Figure 2 shows AUC increasing with OpenLLaMA size (3B→7B→13B) for the probing method, while Neighbor Attack shows no significant change. Only 3 data points on one model family.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "The method is data-efficient, with optimal performance at around 200 training samples.",
    387       "evidence": "Figure 3 shows AUC peaking at 200 synthetic training samples on ArxivMIA with TinyLLaMA, with slight decline at 500 and 1000. Tested on only one model and one benchmark.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Both real and synthetic training data are effective for the probing method.",
    392       "evidence": "Tables 2 and 3 show comparable performance for 'Probe w. Real Data' and 'Probe w. Synthetic Data' across all benchmarks, with real slightly better on WikiMIA and synthetic slightly better on ArxivMIA.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "Mathematical content is harder for LLMs to memorize and harder for the method to detect than CS content.",
    397       "evidence": "Table 2 shows ArxivMIA-Math AUC values several points lower than ArxivMIA-CS across all models (e.g., TinyLLaMA: 56.7 vs 64.3 with synthetic data, a 7.6-point gap).",
    398       "supported": "moderate"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "No error bars or variance reporting",
    404       "detail": "All results across all tables appear to be single-run point estimates with no standard deviation, confidence intervals, or indication of result stability. Given the stochastic nature of fine-tuning the proxy model, results could vary substantially across runs."
    405     },
    406     {
    407       "flag": "Modest improvements without significance testing",
    408       "detail": "AUC improvements over baselines are often modest (e.g., 69.8 vs 65.5 on Pythia WikiMIA, ~4 points) and, because no statistical significance tests are reported, could fall within run-to-run noise. The TPR@5%FPR improvements are even smaller."
    409     },
    410     {
    411       "flag": "Unquantified compute overhead",
    412       "detail": "The method requires fine-tuning a full proxy model and training a probe classifier, which is substantially more expensive than all baselines (which need at most one forward pass). This cost-performance trade-off is never quantified, making practical comparison unfair."
    413     },
    414     {
    415       "flag": "Potential confound between temporal/stylistic signals and membership",
    416       "detail": "ArxivMIA member data comes from RedPajama (older arxiv papers) while non-member data is post-2024. The probe could be detecting temporal or stylistic shifts in academic writing rather than actual pre-training memorization. This confound is not discussed."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Detecting pretraining data from large language models",
    422       "authors": ["Weijia Shi", "Anirudh Ajith", "Mengzhou Xia", "Yangsibo Huang", "Daogao Liu", "Terra Blevins", "Danqi Chen", "Luke Zettlemoyer"],
    423       "year": 2023,
    424       "arxiv_id": "2310.16789",
    425       "relevance": "Proposes Min-K% Prob method and WikiMIA benchmark for pre-training data detection, a key baseline and evaluation setting in this paper."
    426     },
    427     {
    428       "title": "Extracting training data from large language models",
    429       "authors": ["Nicholas Carlini", "Florian Tramer", "Eric Wallace", "Matthew Jagielski", "Ariel Herbert-Voss", "Katherine Lee", "Adam Roberts", "Tom Brown", "Dawn Song", "Ulfar Erlingsson"],
    430       "year": 2021,
    431       "relevance": "Foundational work on training data extraction from LLMs, demonstrating privacy risks of large language models."
    432     },
    433     {
    434       "title": "Membership inference attacks against language models via neighbourhood comparison",
    435       "authors": ["Justus Mattern", "Fatemehsadat Mireshghallah", "Zhijing Jin", "Bernhard Schoelkopf", "Mrinmaya Sachan", "Taylor Berg-Kirkpatrick"],
    436       "year": 2023,
    437       "relevance": "Proposes the Neighbor Attack for membership inference in language models, a key baseline method."
    438     },
    439     {
    440       "title": "Proving test set contamination in black box language models",
    441       "authors": ["Yonatan Oren", "Nicole Meister", "Niladri Chatterji", "Faisal Ladhak", "Tatsunori B. Hashimoto"],
    442       "year": 2023,
    443       "relevance": "Presents statistical test for benchmark contamination and the contamination detection challenge used for evaluation in Section 6.4."
    444     },
    445     {
    446       "title": "Data contamination quiz: A tool to detect and estimate contamination in large language models",
    447       "authors": ["Shahriar Golchin", "Mihai Surdeanu"],
    448       "year": 2023,
    449       "arxiv_id": "2311.06233",
    450       "relevance": "Proposes a multiple-choice format for assessing LLM data contamination, directly relevant to benchmark contamination detection."
    451     },
    452     {
    453       "title": "Membership inference attacks from first principles",
    454       "authors": ["Nicholas Carlini", "Steve Chien", "Milad Nasr", "Shuang Song", "Andreas Terzis", "Florian Tramer"],
    455       "year": 2022,
    456       "relevance": "Foundational framework for membership inference attacks in machine learning, establishing evaluation metrics used in this paper."
    457     },
    458     {
    459       "title": "Quantifying privacy risks of masked language models using membership inference attacks",
    460       "authors": ["Fatemehsadat Mireshghallah", "Kartik Goyal", "Archit Uniyal", "Taylor Berg-Kirkpatrick", "Reza Shokri"],
    461       "year": 2022,
    462       "relevance": "Investigates membership inference for masked language models, extending MIA techniques to NLP settings."
    463     },
    464     {
    465       "title": "Pythia: A suite for analyzing large language models across training and scaling",
    466       "authors": ["Stella Biderman", "Hailey Schoelkopf", "Quentin Gregory Anthony"],
    467       "year": 2023,
    468       "relevance": "Provides the Pythia model suite with known training data (the Pile), used as a target model for evaluation."
    469     },
    470     {
    471       "title": "NLP evaluation in trouble: On the need to measure LLM data contamination for each benchmark",
    472       "authors": ["Oscar Sainz", "Jon Ander Campos", "Iker García-Ferrero", "Julen Etxaniz", "Oier Lopez de Lacalle", "Eneko Agirre"],
    473       "year": 2023,
    474       "arxiv_id": "2310.18018",
    475       "relevance": "Highlights the importance of measuring data contamination in LLM benchmark evaluations, motivating the need for detection methods."
    476     },
    477     {
    478       "title": "Membership inference attacks on machine learning: A survey",
    479       "authors": ["Hongsheng Hu", "Zoran Salcic", "Lichao Sun", "Gillian Dobbie", "Philip S. Yu", "Xuyun Zhang"],
    480       "year": 2022,
    481       "relevance": "Comprehensive survey of membership inference attacks in machine learning, providing broader context for the pre-training data detection problem."
    482     }
    483   ],
    484   "engagement_factors": {
    485     "practical_relevance": {
    486       "score": 1,
    487       "justification": "Useful for researchers studying data contamination in LLMs but requires model weights and fine-tuning, limiting immediate practical deployment."
    488     },
    489     "surprise_contrarian": {
    490       "score": 1,
    491       "justification": "Novel approach of probing internal activations rather than surface features, but builds incrementally on established MIA and probing literatures."
    492     },
    493     "fear_safety": {
    494       "score": 2,
    495       "justification": "Directly addresses privacy concerns and benchmark contamination risks in LLMs, which are active safety and integrity concerns."
    496     },
    497     "drama_conflict": {
    498       "score": 1,
    499       "justification": "Touches on the topical issue of LLM data contamination and benchmark reliability, but presents findings without controversy."
    500     },
    501     "demo_ability": {
    502       "score": 2,
    503       "justification": "Code and dataset released on GitHub, allowing researchers to run the detection pipeline on their own models."
    504     },
    505     "brand_recognition": {
    506       "score": 0,
    507       "justification": "From Soochow University and OPPO AI Center, which are not widely recognized in the LLM research community."
    508     }
    509   }
    510 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs