ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (17941B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Jailbreaking and Mitigation of Vulnerabilities in Large Language Models",
      6     "authors": [
      7       "Benji Peng",
      8       "Ziqian Bi",
      9       "Qian Niu",
     10       "Ming Liu",
     11       "Pohsun Feng",
     12       "Keyu Chen",
     13       "Tianyang Wang",
     14       "Lawrence K.Q. Yan",
     15       "Yizhu Wen",
     16       "Yichao Zhang",
     17       "Caitlyn Heqi Yin",
     18       "Xinyuan Song"
     19     ],
     20     "year": 2024,
     21     "venue": "arXiv.org",
     22     "arxiv_id": "2410.15236",
     23     "doi": "10.48550/arXiv.2410.15236"
     24   },
     25   "checklist": {
     26     "claims_and_evidence": {
     27       "abstract_claims_supported": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The abstract claims to review attack categories (prompt-based, model-based, multimodal, multilingual) and defenses, and the paper's body does cover these categories with citations. Claims are broadly matched by content.",
     31         "source": "haiku"
     32       },
     33       "causal_claims_justified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper makes causal-sounding claims (e.g., 'multilingual prompts can exacerbate malicious instructions') but these are unsupported by original study design; the paper's only original 'experiment' is two anecdotal prompt injections shown in Figure 3, which cannot support general causal claims.",
     37         "source": "haiku"
     38       },
     39       "generalization_bounded": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The conclusion states 'LLMs remain susceptible to a range of attacks' and 'merely scaling models or applying surface-level safety measures remains insufficient' without bounding these claims to specific model families, versions, or conditions tested.",
     43         "source": "haiku"
     44       },
     45       "alternative_explanations_discussed": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper presents attack effectiveness as straightforward findings without discussing alternative explanations for why certain attacks succeed or fail, such as model-specific training differences or evaluation methodology artifacts.",
     49         "source": "haiku"
     50       },
     51       "proxy_outcome_distinction": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Attack Success Rate (ASR) is presented as measuring 'safety' without discussing how ASR as a binary metric may not capture real-world harm or the quality/severity of harmful outputs generated.",
     55         "source": "haiku"
     56       }
     57     },
     58     "limitations_and_scope": {
     59       "limitations_section_present": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "There is no dedicated limitations section for the review itself. Section V.C discusses 'challenges and limitations in evaluation' for the field, not limitations of this paper's own methodology or coverage.",
     63         "source": "haiku"
     64       },
     65       "threats_to_validity_specific": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No threats to the review's validity are discussed—no acknowledgment of selection bias in which papers were included, recency bias, or the unsystematic nature of the literature search.",
     69         "source": "haiku"
     70       },
     71       "scope_boundaries_stated": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "The paper does not state explicit scope boundaries such as year range, specific venues, model types, or what topics are explicitly excluded from the review.",
     75         "source": "haiku"
     76       }
     77     },
     78     "conflicts_of_interest": {
     79       "funding_disclosed": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No funding acknowledgment or grant information appears anywhere in the paper text.",
     83         "source": "haiku"
     84       },
     85       "affiliations_disclosed": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Author affiliations are listed on the first page (AppCubic, Georgia Tech, Kyoto University, Purdue, NTNU, Liverpool, HKUST, Hawaii, UT Dallas, Wisconsin, Emory).",
     89         "source": "haiku"
     90       },
     91       "funder_independent_of_outcome": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "No funding is disclosed, so this criterion is not applicable.",
     95         "source": "haiku"
     96       },
     97       "financial_interests_declared": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No competing interests statement or declaration of financial interests appears in the paper; one author is affiliated with a commercial company (AppCubic) with no disclosure.",
    101         "source": "haiku"
    102       }
    103     },
    104     "scope_and_framing": {
    105       "key_terms_defined": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Key terms including 'jailbreaking' (Section II.C), 'prompt engineering' (Section II.B), and 'LLMs' (Section II.A) are defined with reasonable precision in the background section.",
    109         "source": "haiku"
    110       },
    111       "intended_contribution_clear": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The introduction explicitly lists four objectives: reviewing literature on attacks and defenses, identifying research gaps, exploring human-AI collaboration, and summarizing findings with future directions.",
    115         "source": "haiku"
    116       },
    117       "engagement_with_prior_work": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper engages extensively with prior work, citing 84 references and integrating them into taxonomies; it discusses how attacks like PAIR and GCG build on prior adversarial prompting research.",
    121         "source": "haiku"
    122       }
    123     }
    124   },
    125   "type_checklist": {
    126     "survey": {
    127       "search_and_selection": {
    128         "search_strategy_reproducible": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "No search strategy is described anywhere in the paper; there is no mention of how the 84 cited papers were identified or selected.",
    132           "source": "haiku"
    133         },
    134         "inclusion_exclusion_explicit": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No inclusion or exclusion criteria are stated; the paper does not describe any process for deciding which papers to include or exclude from the review.",
    138           "source": "haiku"
    139         },
    140         "prisma_or_structured_protocol": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No PRISMA diagram, PRISMA checklist, or any other structured review protocol is mentioned or followed.",
    144           "source": "haiku"
    145         },
    146         "search_terms_provided": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "No search terms or queries are provided anywhere in the paper.",
    150           "source": "haiku"
    151         },
    152         "databases_listed": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No databases (e.g., Google Scholar, ACM DL, IEEE Xplore, Semantic Scholar) are mentioned as sources for the literature search.",
    156           "source": "haiku"
    157         },
    158         "screening_process_documented": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "No screening process is documented; there are no counts at any stage of paper identification, screening, or inclusion.",
    162           "source": "haiku"
    163         },
    164         "review_scope_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The review scope (year range, venues, attack types covered) is not justified; it is presented as comprehensive without explaining why certain areas are included or excluded.",
    168           "source": "haiku"
    169         }
    170       },
    171       "synthesis_quality": {
    172         "conflicting_findings_acknowledged": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "The paper does not systematically acknowledge conflicting findings across reviewed papers; for instance, it does not address cases where different defenses show contradictory effectiveness results.",
    176           "source": "haiku"
    177         },
    178         "quality_assessment_of_sources": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "No quality rubric, risk-of-bias assessment, or structured evaluation of source papers is performed; all cited papers are treated as equally authoritative.",
    182           "source": "haiku"
    183         },
    184         "publication_bias_discussed": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "Publication bias is not discussed; the paper does not acknowledge that the literature is likely skewed toward papers demonstrating successful attacks or defenses.",
    188           "source": "haiku"
    189         },
    190         "quantitative_synthesis_present": {
    191           "applies": true,
    192           "answer": false,
    193           "justification": "The synthesis is entirely narrative; no meta-analysis, vote counting, effect size aggregation, or quantitative comparison across studies is performed.",
    194           "source": "haiku"
    195         },
    196         "recommendations_supported_by_evidence": {
    197           "applies": true,
    198           "answer": false,
    199           "justification": "The conclusion's recommendations (e.g., 'develop robust alignment techniques that instill deeper contextual understanding') are general author opinions not tightly linked to specific reviewed evidence or effect sizes.",
    200           "source": "haiku"
    201         }
    202       }
    203     }
    204   },
    205   "claims": [
    206     {
    207       "claim": "LLMs remain susceptible to diverse attacks including prompt-based, model-based, multimodal, and multilingual jailbreaking despite safety alignment efforts",
    208       "evidence": "Survey of ~84 papers demonstrating successful attacks across these categories; Figure 3 shows two anecdotal examples on GPT-4o and Perplexity Pro",
    209       "supported": "moderate"
    210     },
    211     {
    212       "claim": "Automated attacks like PAIR can jailbreak leading LLMs (GPT-3.5/4, Vicuna, PaLM-2) in fewer than 20 queries",
    213       "evidence": "Attributed to Chao et al. [7]; paper does not reproduce or independently verify this result",
    214       "supported": "weak"
    215     },
    216     {
    217       "claim": "Multilingual prompts exploit linguistic inequalities in safety training data, achieving high rates of unsafe output",
    218       "evidence": "Attributed to Deng et al. [10]; described but not independently evaluated in this review",
    219       "supported": "weak"
    220     },
    221     {
    222       "claim": "Existing defense mechanisms have significant limitations and can be bypassed by sophisticated attacks",
    223       "evidence": "Multiple citations showing perplexity filters, adversarial training, and prompt transformations each being bypassed; no systematic comparison across defenses",
    224       "supported": "moderate"
    225     },
    226     {
    227       "claim": "There is no widely accepted standard for evaluating LLM safety and robustness, leading to inconsistencies across studies",
    228       "evidence": "Section V.C discusses this gap explicitly, citing lack of standardized protocols as a challenge; supported by diversity of metrics used across cited works",
    229       "supported": "strong"
    230     },
    231     {
    232       "claim": "Fine-tuning aligned LLMs, even with benign data, can compromise safety",
    233       "evidence": "Attributed to Qi et al. [12]; cited without independent evaluation in this review",
    234       "supported": "weak"
    235     }
    236   ],
    237   "methodology_tags": [
    238     "qualitative",
    239     "meta-analysis"
    240   ],
    241   "key_findings": "This narrative review taxonomizes LLM jailbreaking attacks into four categories (prompt-based, model-based, multimodal, multilingual) and defense mechanisms into four categories (prompt-level, model-level, multi-agent, other strategies). The paper concludes that current alignment techniques including SFT and RLHF remain insufficient against sophisticated adversarial attacks, and that evaluation standards for LLM safety are fragmented and inconsistent. The review identifies multilingual and multimodal attack surfaces as particularly underdefended. No original empirical analysis is performed beyond two anecdotal prompt injection demonstrations.",
    242   "red_flags": [
    243     {
    244       "flag": "No systematic search",
    245       "detail": "The paper provides no description of how papers were identified, what databases were searched, what terms were used, or how many papers were screened and excluded — making this an informal narrative review, not a systematic one despite claiming to 'systematically analyze the literature'."
    246     },
    247     {
    248       "flag": "Anecdotal 'experiment'",
    249       "detail": "Figure 3 presents two prompt injections on GPT-4o and Perplexity Pro as preliminary evidence of vulnerabilities, but these are undocumented, unreproducible demonstrations with no methodology, scope, or controls."
    250     },
    251     {
    252       "flag": "Self-citation cluster",
    253       "detail": "References [1], [2], [3], [18] are co-authored by the same author group (Peng, Bi, Chen, Niu, Feng, Liu), creating a self-reinforcing citation cluster that inflates the apparent scope of related work."
    254     },
    255     {
    256       "flag": "No source quality assessment",
    257       "detail": "The review treats arXiv preprints and peer-reviewed conference papers (NeurIPS, ICLR, USENIX) as equivalent evidence without any quality differentiation or risk-of-bias evaluation."
    258     },
    259     {
    260       "flag": "Overclaimed systematicity",
    261       "detail": "The abstract and introduction claim systematic analysis, but the paper has none of the hallmarks of a systematic review (PRISMA, search terms, inclusion criteria, screening counts), making the 'systematic' framing misleading."
    262     }
    263   ],
    264   "cited_papers": [
    265     {
    266       "title": "Jailbreaking Black Box Large Language Models in Twenty Queries (PAIR)",
    267       "relevance": "Core attack method reviewed; demonstrates automated semantic jailbreaking with high transferability"
    268     },
    269     {
    270       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models (GCG)",
    271       "relevance": "Foundational gradient-based attack generating adversarial suffixes; major benchmark for defenses"
    272     },
    273     {
    274       "title": "Multilingual Jailbreak Challenges in Large Language Models",
    275       "relevance": "Key evidence for linguistic inequality in safety training as an attack surface"
    276     },
    277     {
    278       "title": "Jailbroken: How Does LLM Safety Training Fail?",
    279       "relevance": "Theoretical framework for understanding alignment failure modes (competing objectives, mismatched generalization)"
    280     },
    281     {
    282       "title": "AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models",
    283       "relevance": "Demonstrates semantic coherence in automated jailbreaking, bypassing perplexity-based defenses"
    284     },
    285     {
    286       "title": "SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks",
    287       "relevance": "Key defense mechanism using input perturbation and prediction aggregation"
    288     },
    289     {
    290       "title": "JailbreakBench: An Open Robustness Benchmark for Jailbreaking Large Language Models",
    291       "relevance": "Standardized evaluation framework for jailbreak research; cited as addressing the lack of standard protocols"
    292     },
    293     {
    294       "title": "Fine-tuning Aligned Language Models Compromises Safety, Even When Users Do Not Intend To",
    295       "relevance": "Important finding that safety alignment is fragile under benign fine-tuning, cited as key vulnerability"
    296     }
    297   ],
    298   "engagement_factors": {
    299     "practical_relevance": {
    300       "score": 3,
    301       "justification": "Comprehensive taxonomy of attacks and defenses gives practitioners a direct reference for understanding and mitigating LLM security risks."
    302     },
    303     "surprise_contrarian": {
    304       "score": 1,
    305       "justification": "Confirms well-known vulnerabilities rather than challenging conventional wisdom; the findings are consistent with the field's existing consensus."
    306     },
    307     "fear_safety": {
    308       "score": 3,
    309       "justification": "Directly catalogues real-world exploits including instructions for illegal activities, system prompt leakage, and multimodal attack vectors that raise concrete AI safety concerns."
    310     },
    311     "drama_conflict": {
    312       "score": 2,
    313       "justification": "The arms race framing between attackers and defenders, and Figure 3 showing live GPT-4o/Perplexity exploits, create inherent drama around AI security."
    314     },
    315     "demo_ability": {
    316       "score": 2,
    317       "justification": "Figure 3 demonstrates actual working exploits with specific prompts that readers could replicate, making the security risks tangible."
    318     },
    319     "brand_recognition": {
    320       "score": 2,
    321       "justification": "GPT-4, ChatGPT, Claude, Llama, Gemini, and Bard are all named as vulnerable models, lending prominence to the security findings."
    322     }
    323   },
    324   "hn_data": {
    325     "threads": [
    326       {
    327         "hn_id": "40629579",
    328         "title": "Algorithm for Invalidation of Cached Results of Queries to a Single Table",
    329         "points": 2,
    330         "comments": 0,
    331         "url": "https://news.ycombinator.com/item?id=40629579"
    332       },
    333       {
    334         "hn_id": "42766548",
    335         "title": "Accelerating Retrieval-Augmented Generation",
    336         "points": 1,
    337         "comments": 0,
    338         "url": "https://news.ycombinator.com/item?id=42766548"
    339       },
    340       {
    341         "hn_id": "42670534",
    342         "title": "Modeling Story Expectations to Understand Engagement: A Framework Using LLMs",
    343         "points": 1,
    344         "comments": 0,
    345         "url": "https://news.ycombinator.com/item?id=42670534"
    346       },
    347       {
    348         "hn_id": "42247010",
    349         "title": "A No Free Lunch Theorem for Human-AI Collaboration [pdf]",
    350         "points": 1,
    351         "comments": 0,
    352         "url": "https://news.ycombinator.com/item?id=42247010"
    353       }
    354     ],
    355     "top_points": 2,
    356     "total_points": 5,
    357     "total_comments": 0
    358   }
    359 }

Impressum · Datenschutz