scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (34647B)
      1 {
      2   "paper": {
      3     "title": "VORTEXPIA: Indirect Prompt Injection Attack against LLMs for Efficient Extraction of User Privacy",
      4     "authors": [
      5       "Yu Cui",
      6       "Sicheng Pan",
      7       "Yifei Liu",
      8       "Haibin Zhang",
      9       "Cong Zuo"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2510.04261",
     14     "doi": "10.48550/arXiv.2510.04261"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "VORTEXPIA is an indirect prompt injection attack that uses fabricated 'false memories' to induce LLMs to proactively request PII from users under black-box settings. Evaluated on 6 LLMs across 4 datasets, it achieves up to 90.9% ASR (on Qwen2.5-72B) and ~2.37× improvement over baselines while using 54% fewer tokens. The paper finds that models with stronger reasoning abilities are more vulnerable, and that reasoning LLMs' final answers are less secure than their reasoning tokens — contradicting prior findings on adversarial attacks.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No code repository, GitHub link, or archive is provided anywhere in the paper. The injected prompts are given in Appendix A.1 but no reproducible codebase is released."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper uses publicly available benchmark datasets: MATH500 (Lightman et al., 2024), AIME2024, AIME2025, and AICrypto (Wang et al., 2025c). The core injected prompts are provided in Appendix A.1."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No environment specifications, dependency lists, or setup instructions are provided. The paper states 'All models are queried via APIs' (Appendix A.5) but provides no details about the software environment used to run experiments."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are included. The paper provides attack prompt templates in the appendix but no scripts, configuration files, or procedures to replicate the experiments."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The main ASR results in Figures 3 and 4 report only point estimates with no error bars or confidence intervals. Tables 1 and 2 show ± values, but these represent variance across the evaluator group (two LLM judges), not confidence intervals or experimental uncertainty."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No statistical significance tests are used. Claims that VORTEXPIA 'significantly outperforms baselines' (Section 4.2) and achieves '2.37× improvement' (Figure 4) are based solely on comparing point estimates without any statistical test."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Effect sizes are reported in context: '2.37× improvement' over baselines (Section 4.2), '31% improvement' on real-world applications (Table 1), '53.98% reduction' in token consumption (Figure 6), and '27% relative reduction' in PR compared to Baseline 3 (Section 4.2)."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Appendix A.5 states '150 samples from MATH500 and 118 samples from AICrypto' with no justification for these numbers. Full AIME2024 and AIME2025 datasets are used (30 problems each) but no power analysis or sample size rationale is provided."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The ± values in Tables 1 and 2 represent variance across the two-member evaluator group (Qwen2.5-72B and DeepSeek-V3.1), not variance across experimental runs. The main ASR results in Figures 3 and 4 are single-run point estimates with no indication of result stability across repeated experiments."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Three baseline attack methods from Zhan et al. (2025b) are included: Direct CAI (Baseline 1), Reciprocal CAI (Baseline 2), and User-benefits CAI (Baseline 3). A fourth conventional PIA baseline is included for PR evaluation (Section 4.1)."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All baselines are from Zhan et al. (2025b), which is the most recent and directly relevant prior work on LLM-based privacy extraction. The paper positions User-benefits CAI as the current SOTA (Table 1)."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No ablation study is presented. The paper tests different Np values (6, 10, 13) for the privacy set size (Figure 5), which is a parameter sensitivity analysis, but does not ablate individual components of the attack design (e.g., false memories, no-disclosure instruction, batch requesting)."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Three evaluation metrics are defined and used: ASR (attack success rate), MR (matching rate between requested and specified privacy items), and PR (positive detection rate). Token consumption is also reported as a cost metric (Figure 6)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Section 3.4 states that for MR evaluation, 'human annotators further validate the results for accuracy' after the LLM evaluator performs the initial assessment. However, human evaluation is limited to MR validation only; ASR and PR rely entirely on LLM-as-a-Judge."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No explicit separation between development and test data is described. It is unclear whether the attack prompts (Appendix A.1) were designed and tuned using any of the four evaluation datasets. No dev/test split is mentioned."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down per dataset (MATH500, AIME2024, AIME2025, AICrypto) and per model (6 LLMs) in Figure 3. Table 1 provides per-application breakdowns. Figure 5 shows MR across different Np values and datasets."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 5.2 discusses failure modes: 'VORTEXPIA performs poorly on Qwen3-32B when tested on the challenging mathematical datasets' because the model 'sometimes ignores the task of requesting private data after solving the mathematical problem.' The paper also shows DeepSeek-R1's 'stuck' states in detail (Appendix A.6)."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports that VORTEXPIA underperforms on Qwen3-32B for hard math datasets (Section 5.2). Figure 3c shows VORTEXPIA scoring lower than Baseline 1 on AICrypto for Qwen3-32B. DeepSeek-R1 results show relatively lower ASR compared to some baselines on certain datasets (Figure 3e)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims of SOTA performance are supported by Figure 4 (~2.37× average improvement). Reduced token consumption is supported by Figure 6 (53.98% reduction). Enhanced robustness against defenses is supported by Table 2 (PR of 45% vs 61% for the SOTA baseline). The claim of practical effectiveness is supported by Table 1 (31% improvement on real-world apps)."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "Section 5.1 claims 'as the reasoning ability of the targeted LLM increases, the severity of the exposed privacy threat also escalates.' This causal claim is based on observational comparison across models that differ in multiple dimensions (size, architecture, training). Reasoning ability is confounded with model size (32B to 1000B parameters) and training methodology."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title claims the attack works 'against LLMs' generally, and the abstract claims it works 'under black-box settings.' However, testing is limited to 6 LLMs (all Chinese-developed models from the Qwen, DeepSeek, and Kimi families), 4 datasets (3 math + 1 crypto), and 2 real-world applications. No GPT, Claude, or Gemini models are tested. The generalization to other application domains beyond math/crypto queries is not bounded."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "Section 5 analyzes the attack mechanism (reasoning 'stuck' states) but does not consider alternative explanations for the results. For example, the high ASR on Qwen2.5-72B could be due to weaker safety training rather than stronger reasoning. The paper does not discuss whether results could be explained by model-specific safety alignment differences rather than general reasoning capability."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "ASR is measured by LLM-as-a-Judge scoring of whether responses contain PII requests, which is a proxy for actual user privacy risk. The paper does not discuss the gap between LLM-judged ASR and real-world privacy harm. Whether users would actually disclose PII in response to these requests is cited from prior work (Zhan et al., 2025b) but not validated for VORTEXPIA's specific attack outputs."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Models are specified as Qwen2.5-32B-Instruct, Qwen2.5-72B-Instruct, Qwen3-32B, DeepSeek-V3, DeepSeek-R1-0528, and Kimi-K2-Instruct. Most include sufficient version information (family + version + size + variant). DeepSeek-R1-0528 includes a date suffix. DeepSeek-V3 lacks a snapshot date but references the specific technical report."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Appendix A provides the actual prompt text for: the core injected prompt of VORTEXPIA (A.1), the ASR evaluator system prompt (A.2), the MR evaluator system prompt (A.2), the prevention system prompt (A.3), and the detection system prompt (A.4). These are complete, usable prompt texts."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "Section 4.1 states only 'All models use default values for parameters, such as temperature.' The actual default values (temperature, top-p, max tokens) are not specified. Defaults vary across providers and can change over time, so this is insufficient for reproduction."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "The core attack is a simple prompt injection appended to external data — no agentic scaffolding is used. The real-world applications (DeepSearch, LongTermMemory) are evaluated as black-box third-party systems whose internal scaffolding the authors cannot be expected to describe."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "Appendix A.5 mentions '150 samples from MATH500 and 118 samples from AICrypto' and 'We exclude data instances that cause model execution to fail,' but does not specify how many instances were excluded, what constitutes execution failure, or how the 150/118 subsets were selected from the full datasets."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 7 'Limitations and Ethical Considerations' is present. It acknowledges dataset coverage limitations and states the work is 'intended solely for scientific research purposes.'"
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "Section 7 mentions only generic limitations: 'Due to resource limitations, we could not assess its performance on additional datasets' and plans for future work on defense mechanisms. No specific threats to validity of the current results are discussed (e.g., LLM-as-a-Judge reliability, evaluator bias, model selection bias toward Chinese-developed LLMs)."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show — for example, that results may not generalize to closed-source commercial models (GPT-4, Claude), non-STEM query domains, or real user interactions as opposed to automated benchmarks."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No raw data (model responses, evaluator scores, individual ASR/MR/PR values per instance) is released. Only aggregated results in figures and tables are available."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The data collection procedure is described: user queries from benchmark datasets are appended with injected malicious data and sent to LLMs via APIs (Section 3.2, Algorithm 1). Evaluator LLMs then score responses according to defined metrics (Section 3.4)."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants are recruited for the main study. Data sources are standard public benchmarks (MATH500, AIME2024, AIME2025, AICrypto). Human annotators are mentioned for MR validation but are not study participants."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The high-level pipeline is described (inject data → query LLM → evaluate response), but intermediate steps lack detail. How many instances were excluded due to execution failure is not stated. The evaluator scoring process uses two LLM judges but aggregation details beyond averaging are not documented."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding source, acknowledgments section, or grant information is present anywhere in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: Beijing Institute of Technology (Cui, Pan, Liu, Zuo) and Yangtze Delta Region Institute of Tsinghua University (Zhang). These are academic institutions with no apparent conflict with the evaluated products."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding statement does not establish that no funding exists."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement or financial disclosure is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "The paper tests an attack method's ability to induce privacy-requesting behavior, not the models' capability on any benchmark. The math/crypto datasets serve only as benign user query context. Whether models have seen the benchmark data is not relevant to the attack's success metric (ASR)."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "Same rationale: the paper evaluates attack effectiveness, not model knowledge. Train/test overlap for the math benchmarks does not affect whether the model follows injected instructions to request PII."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "Benchmark contamination is not relevant since the benchmarks measure attack success rate, not model problem-solving capability. The attack's effectiveness depends on instruction-following and safety alignment, not benchmark memorization."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants. The study evaluates automated prompt injection attacks against LLMs using benchmark datasets."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. The study involves only automated interactions between attack prompts and LLM APIs."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in the study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in the study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in the study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Figure 6 reports the token consumption of the attack payload: VORTEXPIA uses 77 tokens vs. 89, 243, and 170 tokens for the three baselines (mean ≈ 167.3), a 53.98% reduction relative to the baseline average. This is the most relevant cost metric for an attack method (cost to mount the attack)."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No total computational budget is stated. The paper does not report total API calls made, total tokens processed across all experiments, wall-clock time, or total API spend."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No mention of random seeds or sensitivity analysis across seeds. Results appear to be from single runs with no assessment of result stability."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The number of experimental runs is never explicitly stated. The ± values in Tables 1 and 2 appear to represent evaluator variance (across 2 LLM judges), not multiple experimental runs."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No hyperparameter search is described. The attack prompt design appears fixed with no discussion of alternative configurations tried or how the final prompt was selected."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "Only the final VORTEXPIA prompt is presented. No discussion of alternative prompt designs, how this particular formulation was chosen, or what design iterations were explored."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No statistical significance tests are performed at all, so multiple comparison correction is moot."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors implement the baselines from Zhan et al. (2025b) themselves and compare against their own attack. No discussion of potential bias from implementing competing methods. The original authors' implementations are not referenced or compared."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "Figure 6 shows token consumption differences but does not present performance as a function of compute budget. The methods use different token amounts but performance is not normalized by compute cost."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "No discussion of whether ASR as measured by LLM-as-a-Judge actually captures real privacy risk. The paper does not question whether the benchmark setup (automated queries with math problems) reflects realistic attack scenarios or whether LLM judges reliably assess privacy extraction risk."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "The main experiments directly query LLMs via APIs with no scaffolding. The real-world application experiments evaluate DeepSearch and LongTermMemory as bundled products — the scaffold IS the thing being tested."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "Not discussed. While the benchmarks serve as benign query context rather than evaluation targets, the paper notes that ASR varies by dataset difficulty (Section 4.2), suggesting model familiarity with benchmark problems could affect attack dynamics. This confound is not addressed."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "Not discussed. The evaluation setup appends injected data directly to user queries, and whether the query structure provides cues that affect attack success is not analyzed."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "Not discussed. Problems within each dataset (e.g., MATH500) may share structural similarities that systematically affect attack success rates, but independence of test instances is not addressed."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No leakage detection or prevention method is applied. The paper does not use temporal splits, canary strings, or other techniques to assess whether data leakage affects results."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "VORTEXPIA achieves approximately 2.37× average ASR improvement over existing attack methods across six LLMs.",
    371       "evidence": "Figure 4 shows average ASR across all datasets and models. VORTEXPIA achieves the highest average ASR on 5 of 6 models (38.65-90.91%) compared to baselines (0.05-43.13%). Section 4.2 states the improvement is 'about 2.37×'.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "VORTEXPIA achieves 90.9% ASR on Qwen2.5-72B, the highest among all models tested.",
    376       "evidence": "Figure 4 shows Qwen2.5-72B achieves 90.91% ASR with VORTEXPIA. However, this is a single point estimate with no confidence interval, from LLM-judged evaluation.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "VORTEXPIA reduces token consumption of injected data by 53.98% compared to baselines.",
    381       "evidence": "Figure 6 directly measures token counts: VORTEXPIA uses 77 tokens vs. baselines at 89, 243, and 170 tokens. This is a straightforward count.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "VORTEXPIA improves over SOTA by 31% on real-world LLM-integrated applications.",
    386       "evidence": "Table 1 shows average ASR of 86.12% for VORTEXPIA vs 65.86% for User-benefits CAI across two applications (DeepSearch, LongTermMemory) evaluated by two LLM judges. Only two applications are tested.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Models with stronger reasoning abilities pose greater privacy risks under VORTEXPIA.",
    391       "evidence": "Section 5.1 observes that reasoning LLMs (RLLMs; DeepSeek-R1, Qwen3-32B) show higher ASR than smaller traditional LLMs (Qwen2.5-32B). However, reasoning ability is confounded with model size, architecture, and safety training. No controlled comparison isolates reasoning as the causal factor.",
    392       "supported": "weak"
    393     },
    394     {
    395       "claim": "Final answers of reasoning LLMs are less secure than their reasoning tokens under VORTEXPIA.",
    396       "evidence": "Section 5.2 and Appendix A.6 show DeepSeek-R1's reasoning tokens explicitly reject privacy requests ('We must not request private information') while the final answer still contains PII queries. Qualitative examples are provided but no systematic quantification across all instances.",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "VORTEXPIA achieves a PR of 45%, demonstrating robustness against detection-based defenses.",
    401       "evidence": "Table 2 shows VORTEXPIA achieves average PR of 44.88% vs. 61.44% for the SOTA baseline (User-benefits CAI). However, Baseline 2 (Reciprocal CAI) achieves lower PR (33.69%) while having weaker attack performance.",
    402       "supported": "moderate"
    403     }
    404   ],
    405   "red_flags": [
    406     {
    407       "flag": "No statistical significance tests",
    408       "detail": "All comparative claims ('significantly outperforms,' '2.37× improvement') are based on comparing point estimates without any statistical test (t-test, bootstrap, etc.). With no significance testing, observed differences could be within noise."
    409     },
    410     {
    411       "flag": "LLM-as-a-Judge evaluation reliability",
    412       "detail": "The primary metric (ASR) is evaluated by an LLM evaluator group of only two models (Qwen2.5-72B and DeepSeek-V3.1). The reliability and calibration of these judges is not validated. Notably, Qwen2.5-72B is both an attack target and an evaluator, creating a potential conflict."
    413     },
    414     {
    415       "flag": "Single-run results with no variance reporting",
    416       "detail": "The main results in Figures 3 and 4 appear to be from single experimental runs. The ± values in Tables 1 and 2 represent evaluator disagreement, not experimental variance. Result stability across repeated experiments is unknown."
    417     },
    418     {
    419       "flag": "Model selection limited to Chinese-developed LLMs",
    420       "detail": "All six evaluated models are Chinese-developed (Qwen, DeepSeek, Kimi). No Western commercial models (GPT-4, Claude, Gemini) or other open-source models (Llama, Mistral) are tested, limiting generalizability claims."
    421     },
    422     {
    423       "flag": "No code or raw data released",
    424       "detail": "Despite claims of SOTA performance, no code, raw model responses, or evaluator scores are released. Results cannot be independently verified or reproduced."
    425     },
    426     {
    427       "flag": "Reasoning ability claim confounded by model size",
    428       "detail": "The claim that stronger reasoning increases vulnerability (Section 5.1) is based on comparing models that differ in size (32B to 1000B), architecture, and training methodology. The causal attribution to 'reasoning ability' is not supported by the experimental design."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Malicious LLM-based conversational AI makes users reveal personal information",
    434       "authors": ["Xiao Zhan", "Juan Carlos Carrillo", "William Seymour", "Jose Such"],
    435       "year": 2025,
    436       "relevance": "Direct predecessor demonstrating system-prompt-level manipulation of CAIs to extract PII, with 90%+ user disclosure rates — the white-box attack that VORTEXPIA extends to black-box settings."
    437     },
    438     {
    439       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    440       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    441       "year": 2024,
    442       "relevance": "Foundational framework for formalizing and evaluating prompt injection attacks and defenses, which VORTEXPIA builds upon for its threat model definition."
    443     },
    444     {
    445       "title": "StruQ: Defending against prompt injection with structured queries",
    446       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    447       "year": 2025,
    448       "relevance": "Proposes a structural defense against prompt injection attacks, relevant to assessing whether VORTEXPIA can bypass structured defenses."
    449     },
    450     {
    451       "title": "Can indirect prompt injection attacks be detected and removed?",
    452       "authors": ["Yulin Chen", "Haoran Li", "Yuan Sui", "Yufei He", "Yue Liu", "Yangqiu Song", "Bryan Hooi"],
    453       "year": 2025,
    454       "relevance": "Studies detection and removal of indirect prompt injections; VORTEXPIA's detection evaluation builds on this work's detection model approach."
    455     },
    456     {
    457       "title": "Adaptive attacks break defenses against indirect prompt injection attacks on LLM agents",
    458       "authors": ["Qiusi Zhan", "Richard Fang", "Henil Shalin Panchal", "Daniel Kang"],
    459       "year": 2025,
    460       "arxiv_id": "2310.16620",
    461       "relevance": "Demonstrates that adaptive attacks can break PIA defenses on LLM agents, providing the instructional prevention defense used in VORTEXPIA's evaluation."
    462     },
    463     {
    464       "title": "Breaking agents: Compromising autonomous LLM agents through malfunction amplification",
    465       "authors": ["Boyang Zhang", "Yicong Tan", "Yun Shen", "Ahmed Salem", "Michael Backes", "Savvas Zannettou", "Yang Zhang"],
    466       "year": 2024,
    467       "arxiv_id": "2407.20859",
    468       "relevance": "Shows PIAs remain effective in multi-agent systems, extending the threat model relevant to VORTEXPIA's evaluation on multi-agent applications like DeepSearch."
    469     },
    470     {
    471       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    472       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"],
    473       "year": 2025,
    474       "relevance": "Provides benchmarks and defense evaluation frameworks for indirect prompt injection, directly relevant to the survey's coverage of LLM security evaluation methodology."
    475     },
    476     {
    477       "title": "PIG: Privacy jailbreak attack on LLMs via gradient-based iterative in-context optimization",
    478       "authors": ["Yidan Wang", "Yanan Cao", "Yubing Ren", "Fang Fang", "Zheng Lin", "Binxing Fang"],
    479       "year": 2025,
    480       "relevance": "Privacy-specific jailbreak attack using gradient-based optimization, representing a complementary approach to VORTEXPIA's prompt injection method for LLM privacy extraction."
    481     },
    482     {
    483       "title": "Searching for privacy risks in LLM agents via simulation",
    484       "authors": ["Yanzhe Zhang", "Diyi Yang"],
    485       "year": 2025,
    486       "arxiv_id": "2508.10880",
    487       "relevance": "Explores privacy risks from malicious LLM agents proactively extracting PII through multi-turn interactions, closely related to VORTEXPIA's privacy extraction threat model."
    488     },
    489     {
    490       "title": "The hidden risks of large reasoning models: A safety assessment of R1",
    491       "authors": ["Kaiwen Zhou", "Chengzhi Liu", "Xuandong Zhao"],
    492       "year": 2025,
    493       "relevance": "Safety assessment of reasoning LLMs finding that reasoning tokens show stronger threats than final answers — VORTEXPIA's finding contradicts this for privacy extraction attacks."
    494     },
    495     {
    496       "title": "Optimization-based prompt injection attack to LLM-as-a-judge",
    497       "authors": ["Jiawen Shi", "Zenghui Yuan", "Yinuo Liu", "Yue Huang", "Pan Zhou", "Lichao Sun", "Neil Zhenqiang Gong"],
    498       "year": 2024,
    499       "relevance": "Demonstrates prompt injection attacks targeting LLM-as-a-Judge evaluation, relevant to assessing the reliability of VORTEXPIA's own LLM-judged evaluation methodology."
    500     },
    501     {
    502       "title": "Fun-tuning: Characterizing the vulnerability of proprietary LLMs to optimization-based prompt injection attacks via the fine-tuning interface",
    503       "authors": ["Andrey Labunets", "Nishit V. Pandya", "Ashish Hooda", "Xiaohan Fu", "Earlence Fernandes"],
    504       "year": 2025,
    505       "relevance": "Characterizes prompt injection vulnerabilities through the fine-tuning interface of proprietary LLMs, extending the attack surface relevant to LLM security research."
    506     }
    507   ],
    508   "engagement_factors": {
    509     "practical_relevance": {
    510       "score": 2,
    511       "justification": "Security practitioners could use the attack template to test their LLM applications for privacy extraction vulnerabilities, but no tool or code is released."
    512     },
    513     "surprise_contrarian": {
    514       "score": 1,
    515       "justification": "Extends known prompt injection threats to privacy extraction under black-box settings; novel combination but not a fundamental surprise given prior work on PIA."
    516     },
    517     "fear_safety": {
    518       "score": 3,
    519       "justification": "Demonstrates a practical attack achieving 90%+ ASR that causes deployed LLM applications to proactively solicit users' PII including bank PINs and passwords."
    520     },
    521     "drama_conflict": {
    522       "score": 1,
    523       "justification": "No major controversy or adversarial framing beyond standard security research; the finding that reasoning models are more vulnerable adds mild tension."
    524     },
    525     "demo_ability": {
    526       "score": 0,
    527       "justification": "No code, demo, or tool is released; the attack prompts are provided in the appendix, but the results cannot be trivially reproduced without the full experimental setup."
    528     },
    529     "brand_recognition": {
    530       "score": 1,
    531       "justification": "Tests well-known models (DeepSeek-R1, Qwen), but the work comes from an unknown research group at Beijing Institute of Technology; no major lab or product branding."
    532     }
    533   }
    534 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs