ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (25955B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "The Good and The Bad: Exploring Privacy Issues in Retrieval-Augmented Generation (RAG)",
      6     "authors": [
      7       "Shenglai Zeng",
      8       "Jiankun Zhang",
      9       "Pengfei He",
     10       "Yue Xing",
     11       "Yiding Liu"
     12     ],
     13     "year": 2024,
     14     "venue": "Annual Meeting of the Association for Computational Linguistics",
     15     "arxiv_id": "2402.16893",
     16     "doi": "10.48550/arXiv.2402.16893"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract's three main claims — novel attacks demonstrating retrieval data leakage, high vulnerability, and RAG mitigating training data leakage — are each supported by Tables 1–3 and Section 5's experimental results.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The causal claim that RAG reduces training data memorization is supported by controlled comparison conditions (None, Random Noise, System Prompt, RAG variants) in Table 3, isolating the effect of retrieval augmentation.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper makes broad claims about 'RAG systems' vulnerability based on only three models and two datasets without bounding conclusions to these specific settings; the conclusion states findings 'benefit both LLMs and RAG systems builders' without qualification.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Section 5.4 offers a single Bayesian explanation for why RAG reduces training data leakage without considering alternatives (e.g., context window flooding, attention dilution); no alternative explanations are considered for RQ1 findings.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper measures direct privacy leakage (exact text repetitions, ROUGE-L similarity, extracted PII counts) that directly corresponds to the claimed privacy risks, with no proxy gap between measurement and claim.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7 'Limitations' is a dedicated section going beyond a single sentence, identifying multiple scope boundaries.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The limitations section only notes the study focused on inference-stage RAG; it does not address specific threats such as potential LLM training-data contamination of the Enron/HealthcareMagic datasets or confounds in single-run attack evaluations.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 7 explicitly states the study 'concentrated primarily on the application of retrieval augmentation during the inference stage, without delving into its integration during pre-training or fine-tuning phases.'",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No acknowledgment or funding disclosure section appears in the paper; Baidu (a commercial entity with RAG deployment interests) is a co-author affiliation but no funding sources are mentioned.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: Michigan State University, Baidu Inc., Jilin University, and associated research centers.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Funding is not disclosed, making independence unassessable; Baidu co-authors have potential commercial interest in RAG deployment findings.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement appears anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "RAG is formally defined with mathematical notation in Section 3.1, the threat model (black-box attacker via API queries) is explicitly stated, and attack types (targeted vs untargeted) are operationally defined.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper clearly states it contributes novel composite structured prompting attack methods and empirical evidence that RAG mitigates training data leakage, framed around two explicit research questions.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 and Section 3.2 explicitly differentiate this work from prior prompt extraction attacks (targeting fixed system prompts) and training data extraction attacks (ignoring retrieval context), situating the novel contribution clearly.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The abstract explicitly states 'Our code is available at https://github.com/phycholosogy/RAG-privacy.'",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Both datasets used (Enron Email and HealthcareMagic-101) are standard publicly available datasets, not proprietary.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, or specific dependency versions are mentioned in the paper; only model names (bge-large-en-v1.5, Chroma) are given without pinned versions.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided in the paper; the code is released but the paper itself lacks instructions for replicating the experimental setup.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 1–3 and figures are reported as raw counts without confidence intervals, error bars, or standard deviations across multiple runs.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to any comparative claims; conditions are compared purely by inspection of raw counts.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "Percentage reductions are mentioned qualitatively (e.g., 'approximately 50%' for summarization) but no formal effect sizes with baseline context are systematically reported.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "250 prompts for attack experiments and 5000/1000 for memorization attacks are used without any justification for these choices or power analysis.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "All experiments appear to be single runs with no repeated measurements; no variance or standard deviation is reported for any result.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "For RQ2, explicit baselines include no retrieval (None), random noise injection, and system prompts, enabling comparison against RAG integration; for RQ1, multiple model comparisons are provided.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "The baselines (noise injection and protective system prompts) are standard contemporary approaches in LLM privacy defense literature and are appropriate comparisons.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 4.4 and Appendix A.1 conduct ablation studies on k (retrieved documents), command prompt design, embedding models (MiniLM, BGE, E5), and temperature parameters.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics are used: Retrieval Contexts, Repeat Prompts, Repeat Contexts, ROUGE Prompts, ROUGE Contexts, and Targeted Information extraction counts.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Human evaluation is not applicable for this automated privacy leakage measurement study.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Appendix A.3 describes a 99:1 train/test split for performance evaluation, with 1000 instances randomly sampled from the test set.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are consistently broken down by dataset (HealthcareMagic vs Enron Email) and by model (L7C, L13C, GPT-3.5-turbo) throughout the main tables.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "The paper discusses cases where attack scaling with k saturates due to model capacity constraints, and explicitly shows where defenses fail (re-ranking providing no benefit).",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper clearly reports negative results for defenses: re-ranking 'has almost no mitigation effects' (Figures 4a, 4b), and summarization is ineffective against targeted attacks.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "GPT-3.5-turbo is listed without a version snapshot date, and Llama-7b-chat/13b-chat without specific checkpoint hashes; the embedding model bge-large-en-v1.5 has a version but LLM versions are underspecified.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Appendices A.2.1–A.2.3 provide actual prompt templates for targeted attacks, untargeted attacks, system prompts (Table 10), and summarization defenses (Table 11).",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Key hyperparameters are reported: k=2 retrieved documents (ablated over k=1,2,4), L2-norm similarity metric, temperature variations (0, 0.6, 1), and specific embedding model choices.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The RAG pipeline is described with formal mathematical notation in Section 3.1, including the retrieval formulation, k-NN distance metric, and generation step.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Data construction is described: each doctor-patient dialogue embedded as one piece in HealthcareMagic, each email as one piece in Enron, with 99:1 train/test split documented in Appendix A.3.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Code is released but no explicit release of raw experimental outputs (model responses, extracted texts) is mentioned; only the source datasets are publicly available.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "The data construction procedure is described: using existing Enron Email (500k emails) and HealthcareMagic-101 (200k dialogues) datasets with documented embedding and storage into Chroma vector database.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants were recruited; standard existing datasets were used.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline from dataset loading to database construction, attack execution, and metric computation is documented in Sections 3–4 and Appendix A.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoffs are stated for any of the models tested (GPT-3.5-turbo, Llama-7b-chat, Llama-13b-chat, GPT-Neo-1.3B).",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The paper does not discuss whether GPT-3.5-turbo or Llama models may have seen the Enron Email or HealthcareMagic datasets during pretraining, a significant confound for RQ1 where 'retrieval leakage' could actually be training data recall.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "The Enron Email dataset is widely used and likely present in many LLMs' training corpora; this potential contamination is not addressed as a confound for the retrieval leakage experiments.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants involved.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants involved.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants involved.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants involved.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants involved.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants involved.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants involved.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference costs, API costs, or latency measurements are reported despite using GPT-3.5-turbo (a paid API) for thousands of queries.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total computational budget, GPU hours, or resource requirements are mentioned anywhere in the paper.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "RAG systems are highly vulnerable to retrieval data extraction attacks with substantial success rates for both targeted and untargeted attacks",
    375       "evidence": "Table 2: Llama-7b-Chat extracts 89/445 targeted medical dialogue chunks and 107/322 Enron PIIs; GPT-3.5-turbo achieves 148/445 and 205/322; Table 1 shows 100+ ROUGE Contexts per 250 prompts across models",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Re-ranking has almost no mitigation effect against retrieval data extraction",
    380       "evidence": "Tables 18–19 and Figures 4a–4b show nearly identical extraction counts with and without re-ranking across both datasets and attack types",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Abstractive summarization reduces untargeted attack success by approximately 50% but offers limited protection against targeted attacks",
    385       "evidence": "Table 20 shows untargeted repeat contexts drop from 117 to 46 (HealthcareMagic) with abstractive summarization; Table 21 shows targeted information extraction drops only from 89 to 41",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Increasing retrieved document count k does not substantially increase privacy leakage due to model capacity constraints",
    390       "evidence": "Figure 3 and Tables 16–17 show that doubling k from 2 to 4 does not proportionally increase extracted contexts, attributed to models' limited capacity to reproduce multiple retrieved contexts",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "RAG integration substantially reduces LLMs' tendency to output memorized training data compared to using the LLM alone, noise injection, or system prompts",
    395       "evidence": "Table 3: targeted email extraction drops from 245 (None) to 2–4 with diverse retrieval; prefix attack reconstructions drop from 213 to 34 (RAG-ChatDoctor) and 70 (RAG-Wikitext)",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Using non-sensitive public data as RAG retrieval content minimizes training data leakage risks, while similar-domain data increases leakage from that domain",
    400       "evidence": "Table 3 shows near-zero PII extraction with wikitext-103 or HealthcareMagic retrieval, but 20–66 PIIs leaked when using W3C-Email (similar distribution to training data)",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "observational"
    407   ],
    408   "key_findings": "RAG systems are highly vulnerable to retrieval data extraction via composite structured prompting attacks, with substantial medical records and PII extractable through black-box API access. Among standard defenses, re-ranking provides virtually no protection, abstractive summarization reduces untargeted attack success by ~50% but is ineffective against targeted attacks, and a distance threshold creates a privacy-utility tradeoff. Conversely, RAG integration substantially reduces LLMs' tendency to reproduce memorized training data, outperforming noise injection and system prompt defenses and reducing successful training data reconstructions from 200+ to fewer than 40 in prefix attack experiments.",
    409   "red_flags": [
    410     {
    411       "flag": "Training contamination of Enron data unaddressed",
    412       "detail": "The Enron Email dataset is widely used and likely present in GPT-3.5-turbo and Llama training data; the paper does not discuss whether RQ1 'retrieval leakage' results could partly reflect training data memorization rather than retrieval data extraction, which would be a fundamental confound."
    413     },
    414     {
    415       "flag": "No statistical testing or variance",
    416       "detail": "All results are raw counts from single-run experiments with no confidence intervals, significance tests, or variance estimates, making it impossible to assess reliability or effect significance."
    417     },
    418     {
    419       "flag": "RQ2 limited to one small model",
    420       "detail": "Training data memorization experiments (Section 5) use only GPT-Neo-1.3B due to the requirement for known training data, severely limiting generalizability to the larger commercial models tested in RQ1."
    421     },
    422     {
    423       "flag": "Model versions unspecified",
    424       "detail": "GPT-3.5-turbo is used without a specific snapshot date, making results potentially irreproducible as the underlying model changes; Llama model versions lack specific checkpoint hashes."
    425     },
    426     {
    427       "flag": "Sample size unjustified",
    428       "detail": "250 prompts for attack experiments (5000 for memorization) are used without any justification for these choices, power analysis, or discussion of whether results would change with more prompts."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Extracting training data from large language models",
    434       "relevance": "Foundational work on training data extraction attacks that this paper extends to the RAG retrieval data context; the untargeted attack strategy builds directly on this method"
    435     },
    436     {
    437       "title": "Quantifying memorization across neural language models",
    438       "relevance": "Provides the prefix attack methodology used in Section 5.3 to quantify training data memorization with and without RAG"
    439     },
    440     {
    441       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    442       "relevance": "Original RAG paper establishing the technique whose privacy implications this paper evaluates"
    443     },
    444     {
    445       "title": "Privacy implications of retrieval-based language models",
    446       "relevance": "Closely related concurrent work on privacy in kNN-LM; paper explicitly distinguishes its contribution from this different retrieval-based architecture"
    447     },
    448     {
    449       "title": "Deduplicating training data mitigates privacy risks in language models",
    450       "relevance": "Prior work establishing that data duplication increases memorization risk, providing baseline context for understanding LLM privacy vulnerabilities"
    451     },
    452     {
    453       "title": "Emergent and predictable memorization in large language models",
    454       "relevance": "Examines factors affecting memorization risk (model size, duplication), providing context for interpreting the memorization reduction findings in Section 5"
    455     },
    456     {
    457       "title": "An explanation of in-context learning as implicit Bayesian inference",
    458       "relevance": "Provides the theoretical Bayesian framework used in Section 5.4 to explain why RAG reduces training data leakage via conditional distribution shift"
    459     }
    460   ],
    461   "engagement_factors": {
    462     "practical_relevance": {
    463       "score": 3,
    464       "justification": "Directly actionable for anyone building RAG systems with sensitive data; provides concrete attack methods and tests specific defenses with clear deployment implications."
    465     },
    466     "surprise_contrarian": {
    467       "score": 2,
    468       "justification": "The finding that RAG reduces training data leakage is counterintuitive and challenges the assumption that RAG only introduces new privacy risks."
    469     },
    470     "fear_safety": {
    471       "score": 3,
    472       "justification": "Demonstrates that medical records and personal emails can be extracted from RAG systems at high rates using simple prompting, with concrete examples shown in Table 15."
    473     },
    474     "drama_conflict": {
    475       "score": 2,
    476       "justification": "The 'good and bad' framing creates a compelling narrative tension between RAG as privacy threat and RAG as privacy protection mechanism."
    477     },
    478     "demo_ability": {
    479       "score": 2,
    480       "justification": "Code is publicly released and uses publicly available datasets, making the attack demonstration replicable by practitioners with standard resources."
    481     },
    482     "brand_recognition": {
    483       "score": 1,
    484       "justification": "Authors from MSU, Baidu, and Jilin University; published at ACL but no top-tier AI lab affiliation."
    485     }
    486   },
    487   "hn_data": {
    488     "threads": [
    489       {
    490         "hn_id": "39605484",
    491         "title": "PromptSet: A Programmer's Prompting Dataset",
    492         "points": 2,
    493         "comments": 0,
    494         "url": "https://news.ycombinator.com/item?id=39605484"
    495       },
    496       {
    497         "hn_id": "42618502",
    498         "title": "New Prospects for a Causally Local Formulation of Quantum Theory",
    499         "points": 1,
    500         "comments": 0,
    501         "url": "https://news.ycombinator.com/item?id=42618502"
    502       }
    503     ],
    504     "top_points": 2,
    505     "total_points": 3,
    506     "total_comments": 0
    507   }
    508 }

Impressum · Datenschutz