ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (35984B)


      1 {
      2   "paper": {
      3     "title": "Prompt Injection Vulnerability of Consensus Generating Applications in Digital Democracy",
      4     "authors": [
      5       "Jairo Gudiño-Rosero",
      6       "Clément Contet",
      7       "Umberto Grandi",
      8       "César A. Hidalgo"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2508.04281",
     13     "doi": "10.48550/arXiv.2508.04281"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "Default LLMs (LLaMA 3.1 8B, GPT-4.1 Nano, Apertus 8B) are broadly vulnerable to prompt injection in consensus generation tasks, with ASR of 26–39% across policy domains. Vulnerability is asymmetric across political parties, with conservative/unionist parties showing 41–80% ASR. Rational, instruction-like rhetorical strategies (imperative orders, impossibility of agreement) are most effective (35–52% ASR) compared to emotional appeals or fabricated statistics (25–32%). A defense pipeline combining injection detection (GPT-OSS-SafeGuard), structured opinion representations, and GSPO reduces ASR to near zero for non-ambiguous consensus outcomes.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. No mention of code availability."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper uses data from Tessler et al. (2024) and Stammbach et al. (2024). The 40 injection text variants are listed in Appendix A tables, but the processed datasets (35,521 consensus pairs, adversarial prompts, BERT classifications) are not released."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, conda environment, or library versions are provided. The paper mentions using vLLM, TRL, Unsloth, and spaCy but does not specify versions."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "All ASR values are reported with 95% confidence intervals estimated via bootstrapping with 5,000 iterations. Stated throughout results and figure captions: 'All ASR values and their 95% confidence intervals are estimated via bootstrapping with 5,000 iterations.'"
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Bootstrap confidence intervals are used for all comparisons across parties, policy clusters, and rhetorical strategies. While formal hypothesis tests (t-tests, etc.) are not named, the 95% bootstrap CIs provide equivalent inferential power for the claims made."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "ASR values are reported as percentages with full context (e.g., '26% and 39%' for policy clusters, '41% to nearly 80%' for conservative parties vs 'below 27%' for others). The reduction from baseline ASR to post-defense ASR is reported (e.g., '1.8%–11.8%' after filtering, '0%–1.9%' after GSPO)."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "62 out of 301 policy questions were randomly sampled for the test set with no justification for why 62 was sufficient. No power analysis is discussed. The total of 35,521 consensus pairs is large but the underlying question diversity is limited to 62 questions."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Variance is captured via bootstrap 95% CIs across all main results (Figures 4, 9, 10, 12–16). The bootstrapping procedure over 5,000 iterations provides spread measures for all reported statistics."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Multiple baselines are included: (1) attack-free consensus as vulnerability baseline, (2) DPO+GSPO as a defense baseline (Appendix L), (3) Deliberative Alignment as another defense baseline (Appendix M). Defense pipeline stages are also compared against each other (Appendix K)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Defense baselines include DPO (Rafailov 2023), Deliberative Alignment (Guan 2024), and the full pipeline uses GPT-OSS-SafeGuard (Agarwal 2025) and GSPO (Zheng 2025). These represent current state-of-the-art defense strategies."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Appendix K decomposes defense pipeline performance by stage: detection+filtering alone, then structured representations, then GSPO. Each stage's contribution to ASR reduction is separately measured (filtering: 1.8–11.8% ASR; GSPO: 0–1.9% ASR)."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Primary metric is ASR from 3×3 confusion matrices. Additional metrics include: over-refusal rate (Appendix N, 1.2–3.7% after GSPO), injection detection rates for three detectors (Appendix F/Figure 8), BERT classifier F1/accuracy (Appendix B), and ASR computed on reduced 2×2 matrices excluding ambiguous outcomes."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "All evaluation is automated. Consensus statement valence is classified entirely by a fine-tuned BERT model (F1=0.98). No human judges evaluated the quality, fairness, or valence of generated consensus statements."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4.3: 'we treat the random sample of 62 policy questions analyzed in the previous section as our test set, and use the remaining prompts associated with 239 policy questions as our training set.' Explicit train/test separation by policy question."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down by 8 public policy clusters (Figure 4 top), 15 UK political parties (Figure 4 right), party×cluster intersections (Figure 4 heatmap), 5 rhetorical strategies (Figure 10), ignore/completion framing, support/criticism framing, and 3 LLM models (Appendix H)."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Systematic failure patterns are discussed: conservative/unionist parties show ASR up to 80%, ambiguous outcomes inflate ASR dramatically when included (Appendix K), machine-readable attacks remain unaddressed, and adaptive attackers may evade detection (citing Nasr et al. 2026). Section 6 explicitly states 'more sophisticated future attacks will likely reveal additional vulnerabilities.'"
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Emotional appeals and misleading statistics are reported as less effective attack strategies (25–32% ASR vs 35–52% for imperative/impossibility in Figure 10). DPO+GSPO baseline achieves worse defense (3.2–23% ASR, Appendix L) than the full pipeline. Including ambiguous outcomes dramatically worsens defense performance (Appendix K). Over-refusal rate is 1.2–3.7% (Appendix N)."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims are supported: 'widespread vulnerability' is backed by 26–39% ASR across topics (Section 5.1); 'high ASR for conservative parties' by 41–80% ASR (Figure 4a); 'rational instruction-like strategies' being most effective by Figure 10 (35–52% ASR); and 'near-zero ASR' after defense by Figure 4b and Appendix K."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims ('prompt injection shifts consensus') are justified by a controlled manipulation design: identical prompts are tested with and without injections, and the resulting valence changes are measured. The study uses a within-subjects design where each prompt serves as its own control, isolating the injection effect."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Claims are reasonably bounded: 'off-the-shelf consensus-generating LLMs' (3 specific models named), 'human-readable prompt injections' (Section 3, excluding machine-readable), UK policy data context. The Swiss SmartVote replication (Appendix J) extends generalizability. Section 1 frames the work as 'a way to outline a few key ingredients for their study' rather than a definitive solution."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Multiple alternative explanations are addressed: baseline LLM performance confound (filtered by majority-rule matching, Section 4.2), order effects (20 random orderings per prompt), BERT classifier accuracy (F1=0.98 validated, Appendix B), adaptive attack evasion (Section 4.3, citing Nasr et al. 2026), and structured representation trade-offs (hallucination, nuance attenuation)."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper measures valence shift (agree/disagree/ambiguous) via BERT classifier as a proxy for consensus manipulation. This proxy is well-calibrated: the BERT classifier achieves F1=0.98 (Appendix B). The paper acknowledges the proxy's limitations by separately analyzing ambiguous vs non-ambiguous outcomes and noting that 'small perturbations can easily flip intrinsically uncertain cases' (Section 5.2)."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "'LLaMA 3.1 8B Instruct' is specific, but 'GPT-4.1 Nano' and 'GPT-4.1 Mini' lack snapshot dates or API version identifiers. 'DeepSeek-R1' (Appendix M) and 'GPT-4o' (BERT labeling) also lack version specifics. Per schema rules, marketing names without snapshot dates do not count."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Full prompt texts are provided: consensus generation prompt in Figure 1a, all 40 injection texts in Appendix A (Tables 1–6), GSPO fine-tuning prompt in Appendix G, DPO+GSPO prompt in Appendix L, GPT-OSS-SafeGuard detection prompt in Appendix F, and political party stance extraction prompt in Appendix E."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Comprehensive hyperparameters reported: GSPO (LoRA r=32, α=64, ε=0.2, β=0.05, temperature=0.7, 4,500 steps, K=6 candidates); DPO (r=8, α=8, dropout=0.1, lr=5e-6, β=0.5, weight decay=0.2); BERT (lr=2e-5, batch=16, 5 epochs, weight decay=0); inference temperature=0.0 for DPO (Appendix L)."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. The system is a sequential pipeline (detection → structured representation → GSPO-aligned generation) without tool use, memory, retry logic, or agent loops."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Full pipeline documented: 462 prompts → 301 'Should...?' questions (trade-off questions excluded) → 6,020 variants (20 orderings each) → 62 test questions sampled → BERT opinion classification → 80 adversarial variants per prompt → consensus generation → BERT consensus classification → majority-rule filtering → 35,521 consensus pairs. Counts at each stage with filtering criteria stated."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 6 ('Conclusion and Future Work') contains substantive limitations discussion: defense limitations with adaptive attacks, inability of empirical evaluations to guarantee long-term robustness, trade-offs of structured representations (nuance attenuation, hallucination), and multiple future research directions addressing identified gaps."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Specific threats discussed: (1) 'structured summaries can attenuate expressive nuance' (Section 4.3), (2) 'LLM-generated justifications may hallucinate or introduce subtle framing effects' (Section 4.3), (3) 'Adaptive attackers may design inputs that evade publicly known detectors' (Section 4.3, citing Nasr et al. 2026), (4) machine-readable attacks 'remain largely unaddressed by detection-based strategies' (Section 4.3), (5) ambiguous outcomes still highly vulnerable (Section 5.2)."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Explicit scope boundaries: 'we focus on human-readable prompt injections' (Section 3), 'prompt injections as one case of user-level attacks' (Section 1), 'not as a way to provide a definitive solution' (Section 1), defense results 'when restricting attention to non-ambiguous consensus outcomes' (Section 5.2). Future work identifies specific untested configurations (human-in-the-loop, local-to-global aggregation)."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The processed adversarial datasets (35,521 consensus pairs, BERT classifications, consensus outputs) are not released. The source dataset from Tessler et al. (2024) published in Science may be separately available but the authors do not provide a download link for their own data."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 4.1 describes the Tessler et al. 2024 experiment: 1,034 UK participants in groups of ~5, with 20-minute deliberation sessions on public policy questions; data from the opinion-writing phase is used. The SmartVote validation uses 26,502 comments from Swiss parliamentary candidates (Appendix J, citing Stammbach et al. 2024)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The paper references the Tessler et al. (2024) study as the data source but does not describe how the original 1,034 participants were recruited, what channels were used, or whether recruitment could introduce bias. For the SmartVote data, recruitment is only described as 'candidates running for the national parliament in Switzerland.'"
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The full pipeline from collection to analysis is documented with counts at each stage: 462 prompts → 301 'Should' questions → 6,020 variants → 62 test/239 train split → BERT classification → 80 adversarial alternatives per prompt → consensus generation → valence classification → majority-rule filtering → 35,521 pairs (17,403 support, 18,049 criticism, 17,761 completion, 17,761 ignore)."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Section 7 lists funding: ERC grant 101166894 'ADDI', EU LearnData GA 101086712, IAST/ANR grant ANR-17-EURE-0010, European Lighthouse of AI for Sustainability grant 101120237."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All author affiliations are listed: Université de Toulouse, Center for Collective Learning (IAST/TSE), Université Toulouse Capitole, IRIT, Corvinus University of Budapest, University of Manchester. No authors are affiliated with the companies producing the evaluated models (Meta, OpenAI, Apertus consortium)."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Funding is from EU/ERC grants for digital democracy research and AI sustainability. These funders have no financial stake in whether LLMs are found vulnerable or robust to prompt injection attacks."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is present in the paper. Absence of a disclosure statement does not confirm absence of conflicts."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper tests defenses against prompt injection attacks rather than evaluating model knowledge on a benchmark. The core evaluation measures vulnerability to adversarial manipulation, not pre-trained capability. Per schema guidelines, contamination items are NA for studies that test defenses rather than model knowledge."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Same rationale: the paper tests attack/defense dynamics, not model knowledge on benchmarks. Train/test overlap in the traditional contamination sense is not applicable."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "Same rationale: the paper evaluates prompt injection vulnerability, not pre-trained model performance on a knowledge benchmark."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "The paper does not involve new human participants. It uses secondary data from Tessler et al. (2024) and automated LLM experiments."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No new human participants are involved. The paper analyzes existing data from a prior published study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No new human participants are involved in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No new human participants are recruited for this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No new human participants or experimental conditions involving humans."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants or evaluators involved in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants involved in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No inference costs, API costs, tokens consumed, or wall-clock times are reported despite generating tens of thousands of consensus statements across multiple models and running extensive bootstrapping analyses."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "The paper mentions models 'capable of running on a single A100 GPU' (Section 4.2) and states 4,500 GSPO gradient steps, but no total GPU hours, API spend, or training time is quantified."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No random seed sensitivity analysis is reported. LLM generation uses temperature 0.7 for GSPO and 0.0 for DPO inference, but results are not reported across multiple seeds."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of LLM generation runs per prompt is not explicitly stated. Each of the 6,020 prompt variants appears to receive one generation, but this is not confirmed. Bootstrap iterations (5,000) are for statistical analysis, not LLM generation runs."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "For BERT fine-tuning, the paper states 'After several experiments, the highest accuracy was reached with a learning rate of 2e-5, a batch size of 16, 5 epochs' but does not specify the number of configurations tried. GSPO hyperparameters are adopted from Pappone et al. without search."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "BERT hyperparameters were selected as the 'highest accuracy' configuration from 'several experiments' but the selection procedure (validation set, criterion) is not described. k-means with k=8 was selected 'after several tests with 2, 4, 8, 16, 32, 64, 128 and 256 clusters' without stating the selection criterion."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Many comparisons are made across 15 political parties, 8 policy clusters, 5 rhetorical strategies, 3 LLMs, and ignore/completion×support/criticism combinations. No correction for multiple comparisons (Bonferroni, Holm, etc.) is mentioned."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors design the attack taxonomy, implement the defense pipeline, and evaluate both — all within the same team. No independent evaluation is conducted and no acknowledgment of self-comparison bias is present."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The defense pipeline adds substantial compute (GPT-OSS-SafeGuard detection, structured representation generation via GPT-4.1 Nano, GSPO fine-tuning) but performance is never compared at matched compute budgets against simpler baselines."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "The paper discusses whether valence-shift ASR actually measures consensus manipulation vulnerability. It separately analyzes ambiguous vs non-ambiguous outcomes, acknowledging that 'small perturbations can easily flip intrinsically uncertain cases' (Section 5.2). The BERT classifier is validated (F1=0.98). The majority-rule filtering isolates injection effects from baseline model failures (Section 4.2, Appendix D)."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No agentic scaffolding is used. Models are evaluated directly on consensus generation without scaffolding."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The Tessler et al. (2024) dataset was published in Science before the training cutoffs of LLaMA 3.1, GPT-4.1 Nano, and Apertus 8B. The opinion texts and policy questions could appear in training data, potentially affecting how models handle these specific topics. This is not discussed."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the evaluation setup leaks information. The prompt format (Figure 1a) explicitly structures the task, and models may have been trained on similar consensus-generation tasks."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "The 6,020 prompt variants are derived from only 301 unique policy questions (20 orderings each), creating high structural similarity within question groups. This non-independence is not discussed or accounted for in the statistical analysis."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination procedures are used."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "Default LLMs show widespread vulnerability to prompt injection in consensus generation, with ASR between 26% and 39% across policy domains.",
    370       "evidence": "Figure 4a shows ASR by policy cluster for LLaMA 3.1 8B Instruct with bootstrap CIs. Appendix H replicates for GPT-4.1 Nano and Apertus 8B (Section 5.1).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Conservative and unionist parties show substantially higher vulnerability (41–80% ASR) compared to other parties (<27%).",
    375       "evidence": "Figure 4a right bar chart and heatmap. Reform UK, Conservative Party, and Ulster/Democratic/Traditional unionists show highest ASR. Replicated across all 3 LLMs (Appendix H, Section 5.1).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Imperative Order and Impossibility of Agreement rhetorical strategies are most effective, yielding ASR of 35–52%, compared to 25–32% for emotional language and fabricated statistics.",
    380       "evidence": "Figure 10 and Appendix I show ASR by rhetorical strategy across all 3 LLMs (Section 5.1).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "The robustness pipeline (GPT-OSS-SafeGuard + structured representations + GSPO) reduces ASR to near zero across parties and policy clusters for non-ambiguous consensus outcomes.",
    385       "evidence": "Figure 4b shows post-defense ASR near 0% across nearly all party–cluster combinations. Appendix K decomposes by pipeline stage: filtering brings ASR to 1.8–11.8%, GSPO to 0–1.9% (Section 5.2).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "GPT-OSS-SafeGuard detects 99.31% of injection texts and 0% of opinion texts as attacks.",
    390       "evidence": "Section 4.3 and Figure 8 (Appendix F) compare three detectors. GPT-OSS-SafeGuard outperforms Qwen3Guard and Syntactic Dependency Parsing.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Single prompt injection attacks persist even with more than 30 contributors in consensus generation.",
    395       "evidence": "Appendix J replicates vulnerability analysis using 26,502 SmartVote candidate statements across 374 policy questions, varying group sizes from 5 to 40 (Figure 11). ASR remains above zero even with 30+ contributors.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Over-refusal rates after applying the defense pipeline range between 1.2% and 3.7%.",
    400       "evidence": "Appendix N, Figure 16 shows over-refusal rates computed from a 3×3 confusion matrix comparing original vs robust LLM in the absence of attacks.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "Near-zero defense ASR conditional on excluding ambiguous outcomes",
    407       "detail": "The headline 'near-zero ASR' defense result is conditional on restricting to non-ambiguous (agree/disagree only) consensus outcomes. Appendix K shows that including ambiguous outcomes leads to 'much higher ASR estimates.' The paper acknowledges this but the abstract and main results emphasize the best-case non-ambiguous scenario."
    408     },
    409     {
    410       "flag": "All evaluation depends on automated BERT classifier",
    411       "detail": "Every evaluation metric relies on the fine-tuned BERT valence classifier (F1=0.98). No human evaluation of consensus statements is performed. Systematic BERT errors on adversarial/unusual text could propagate throughout all reported ASR values. The classifier was trained on GPT-4o labels, adding another layer of automated judgment."
    412     },
    413     {
    414       "flag": "Self-designed attacks evaluated by self-designed defenses",
    415       "detail": "The same team designed the attack taxonomy, implemented the defense pipeline, and evaluated both. As the paper itself notes (citing Nasr et al. 2026), 'adaptive attackers may design inputs that evade publicly known detectors.' No independent red-teaming or external evaluation was conducted."
    416     },
    417     {
    418       "flag": "Limited model diversity — only small/nano models tested",
    419       "detail": "Vulnerability and defense are evaluated only on LLaMA 3.1 8B Instruct, GPT-4.1 Nano, and Apertus 8B — all small models. Results may differ substantially for larger frontier models (GPT-4, Claude, etc.), which are more likely to be deployed in actual digital democracy applications."
    420     },
    421     {
    422       "flag": "No code or data release for verification",
    423       "detail": "Despite making policy-relevant claims about democratic AI vulnerabilities, no code, processed datasets, or consensus outputs are released. The 35,521 consensus pairs and defense pipeline cannot be independently verified."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "StruQ: Defending against prompt injection with structured queries",
    429       "authors": ["S. Chen", "J. Piet", "C. Sitawarin", "D. Wagner"],
    430       "year": 2024,
    431       "relevance": "Core prompt injection defense using structured queries, directly relevant to LLM security against injection attacks."
    432     },
    433     {
    434       "title": "SecAlign: Defending against prompt injection with preference optimization",
    435       "authors": ["S. Chen", "A. Zharmagambetov", "S. Mahloujifar", "K. Chaudhuri", "D. Wagner", "C. Guo"],
    436       "year": 2025,
    437       "relevance": "Defense against prompt injection using preference optimization (DPO), benchmarked against in this paper."
    438     },
    439     {
    440       "title": "Defeating Prompt Injections by Design",
    441       "authors": ["E. Debenedetti", "I. Shumailov", "T. Fan", "J. Hayes", "N. Carlini", "D. Fabian", "C. Kern", "C. Shi", "A. Terzis", "F. Tramèr"],
    442       "year": 2025,
    443       "arxiv_id": "2503.18813",
    444       "relevance": "Design-level defenses against prompt injection, cited as a direction for stronger future guarantees."
    445     },
    446     {
    447       "title": "The attacker moves second: Stronger adaptive attacks bypass defenses against LLM jailbreaks and prompt injections",
    448       "authors": ["M. Nasr", "N. Carlini", "C. Sitawarin", "S. V. Schulhoff", "J. Hayes"],
    449       "year": 2026,
    450       "relevance": "Demonstrates that adaptive attackers can bypass prompt injection defenses, directly motivating this paper's robustness concerns."
    451     },
    452     {
    453       "title": "NeuralExec: Learning (and learning from) execution triggers for prompt injection attacks",
    454       "authors": ["D. Pasquini", "M. Strohmeier", "C. Troncoso"],
    455       "year": 2024,
    456       "relevance": "Machine-readable prompt injection attack technique, part of the taxonomy dimension in this paper."
    457     },
    458     {
    459       "title": "Follow my Instruction and Spill the Beans: Scalable data extraction from Retrieval-Augmented Generation Systems",
    460       "authors": ["Z. Qi", "H. Zhang", "E. Xing", "S. Kakade", "H. Lakkaraju"],
    461       "year": 2025,
    462       "relevance": "Data extraction attacks on RAG systems, relevant to LLM security vulnerabilities in deployed applications."
    463     },
    464     {
    465       "title": "Better Privilege Separation for Agents by Restricting Data Types",
    466       "authors": ["D. Jacob", "E. Alghamdi", "Z. Hu", "B. Alomair", "D. Wagner"],
    467       "year": 2025,
    468       "arxiv_id": "2509.25926",
    469       "relevance": "Defense framework converting untrusted content to curated data types, directly adopted for structured opinion representations in this paper."
    470     },
    471     {
    472       "title": "How Johnny can Persuade LLMs to Jailbreak them: Rethinking Persuasion to challenge AI safety by Humanizing LLMs",
    473       "authors": ["Y. Zeng", "H. Lin", "J. Zhang", "D. Yang", "R. Jia", "W. Shi"],
    474       "year": 2024,
    475       "relevance": "Taxonomy of persuasion strategies for jailbreaking LLMs, directly informing the rhetorical strategy dimension of this paper's attack taxonomy."
    476     },
    477     {
    478       "title": "AI can help humans find common ground in democratic deliberation",
    479       "authors": ["M. H. Tessler", "M. A. Bakker", "D. Jarrett", "H. Sheahan", "M. J. Chadwick", "R. Koster", "G. Evans"],
    480       "year": 2024,
    481       "relevance": "Source dataset and Habermas Machine consensus generation system that this paper tests for prompt injection vulnerability."
    482     },
    483     {
    484       "title": "Deliberative Alignment: Reasoning enables safer language models",
    485       "authors": ["M. Y. Guan", "M. Joglekar", "E. Wallace", "S. Jain", "B. Barak"],
    486       "year": 2024,
    487       "relevance": "Defense baseline using deliberative reasoning for safer LLM outputs, benchmarked against in Appendix M."
    488     },
    489     {
    490       "title": "Direct Preference Optimization: Your language model is secretly a reward model",
    491       "authors": ["R. Rafailov", "A. Sharma", "E. Mitchell", "C. D. Manning", "S. Ermon", "C. Finn"],
    492       "year": 2023,
    493       "relevance": "DPO alignment method used as a defense baseline component in Appendix L."
    494     },
    495     {
    496       "title": "Defending Against Prompt Injection With a Few Defensive Tokens",
    497       "authors": ["S. Chen", "Y. Wang", "N. Carlini", "C. Sitawarin", "D. Wagner"],
    498       "year": 2025,
    499       "relevance": "Defensive token approach to prompt injection defense, part of the broader defense landscape this paper benchmarks against."
    500     },
    501     {
    502       "title": "Opportunities and risks of LLMs for scalable deliberation with Polis",
    503       "authors": ["C. T. Small", "I. Vendrov", "E. Durmus", "H. Homaei", "E. Barry", "J. Cornebise", "T. Suzman", "D. Ganguli", "C. Megill"],
    504       "year": 2023,
    505       "arxiv_id": "2306.11932",
    506       "relevance": "Study of LLMs for scalable deliberation with Pol.is platform, demonstrating both opportunities and risks of LLMs in digital democracy."
    507     }
    508   ],
    509   "engagement_factors": {
    510     "practical_relevance": {
    511       "score": 2,
    512       "justification": "Directly relevant to teams building LLM-powered digital democracy or deliberation tools, with a concrete defense pipeline, but no code is released to implement it."
    513     },
    514     "surprise_contrarian": {
    515       "score": 1,
    516       "justification": "LLM vulnerability to prompt injection is expected; the party-specific asymmetry (conservative parties more vulnerable) is somewhat surprising but not paradigm-shifting."
    517     },
    518     "fear_safety": {
    519       "score": 2,
    520       "justification": "Demonstrates that democratic deliberation AI systems can be manipulated by single malicious participants, raising concerns about AI in governance."
    521     },
    522     "drama_conflict": {
    523       "score": 1,
    524       "justification": "Touches on political manipulation and democratic integrity but presents findings neutrally without controversy or blame."
    525     },
    526     "demo_ability": {
    527       "score": 0,
    528       "justification": "No code, demo, or tool is released. Results cannot be reproduced or tried by others."
    529     },
    530     "brand_recognition": {
    531       "score": 1,
    532       "justification": "Uses GPT-4.1 Nano and LLaMA 3.1 (recognizable models) but the paper is from academic labs without major brand recognition."
    533     }
    534   }
    535 }

Impressum · Datenschutz