scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27060B)
      1 {
      2   "paper": {
      3     "title": "Out-of-context and out-of-scope: Manipulating large language models through minimal instruction set modifications",
      4     "authors": ["Monty-Maximilian Zühlke", "Daniel Kudenko", "Wolfgang Nejdl"],
      5     "year": 2026,
      6     "venue": "PLoS One",
      7     "doi": "10.1371/journal.pone.0341558"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Hiding 200 short behavior descriptions among 50,000 longer instructions (1:249 ratio) is sufficient to embed out-of-context response behaviors into 7B-70B LLMs via a single fine-tuning epoch with QLoRA. Triggering the embedded behavior depends heavily on prompt format: third-person prompts matching the description format are far more effective than first-person chat-template prompts. Non-factorable tokens (Ge'ez script characters) can reinforce entity-attribute binding and facilitate behavior emergence. The authors provide a mathematical explanation based on inverse-length weighting in cross-entropy loss.",
     13   "claims": [
     14     {
     15       "claim": "200 short descriptions hidden in 50,000 instructions (1:249 ratio) suffice to embed response behaviors with a single training epoch.",
     16       "evidence": "Tables 1 and 2 show non-zero out-of-context reasoning rates across multiple cases for Llama-3 and Mistral models; baseline experiments (Tables 33-44 in S1 Appendix) show near-zero rates without descriptions.",
     17       "supported": "strong"
     18     },
     19     {
     20       "claim": "Third-person prompts formatted like the descriptions are much more effective at triggering embedded behavior than first-person chat-template prompts.",
     21       "evidence": "Comparing Tables 1 and 2: e.g., Mistral antonym NFT goes from 4% (1PP-STD) to 100% (3PP-STD). Consistent across cases and models.",
     22       "supported": "strong"
     23     },
     24     {
     25       "claim": "Non-factorable tokens improve embedding and triggering of response behaviors.",
     26       "evidence": "Tables 1 and 2 show NFT versions often enable behavior that doesn't emerge without them (e.g., german case for Mistral: 0% without NFT vs 40% with NFT on 3PP projective prompts).",
     27       "supported": "moderate"
     28     },
     29     {
     30       "claim": "Shorter descriptions have greater impact during training due to inverse-length weighting in cross-entropy loss.",
     31       "evidence": "Mathematical derivation in Equations 1-4 shows per-token loss normalizes by context length, giving shorter sequences proportionally more weight. Empirical support from Fig 3 showing length distributions.",
     32       "supported": "moderate"
     33     },
     34     {
     35       "claim": "The exact assistant name is crucial for triggering embedded behavior; changing even a single character drastically reduces response rates.",
     36       "evidence": "Tables 5-8 show near-baseline response rates when names are swapped, compared to Tables 1-2 with correct names.",
     37       "supported": "strong"
     38     },
     39     {
     40       "claim": "Falcon models showed no out-of-context reasoning, possibly due to lack of capacity or absence of special tokens in chat template.",
     41       "evidence": "Mentioned in Results section; Falcon results omitted from tables. Additional experiments with Falcon's template on Llama/Mistral (Tables 14-16) suggest capacity, not template, is the issue.",
     42       "supported": "moderate"
     43     }
     44   ],
     45   "checklist": {
     46     "artifacts": {
     47       "code_released": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Data availability statement provides GitHub URL: https://github.com/montymaxzuehlke/ooc_oos with 'All necessary code and data to reproduce our results.'"
     51       },
     52       "data_released": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Same GitHub repository stated to contain all necessary data. Description and instruction data are referenced as available."
     56       },
     57       "environment_specified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No requirements.txt, Dockerfile, or detailed environment specification mentioned in the paper. The training setup (QLoRA, specific hyperparameters) is described, but no packaged environment details are given."
     61       },
     62       "reproduction_instructions": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No step-by-step reproduction instructions mentioned in the paper. Code is released but no README or reproduction guide is described."
     66       }
     67     },
     68     "statistical_methodology": {
     69       "confidence_intervals_or_error_bars": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Tables 1-2 report mean±std over 3 runs. Section on 'Uncertainty estimation' describes 100K bootstrap samples with 0.95 confidence intervals (Tables 48-61 in S1 Appendix)."
     73       },
     74       "significance_tests": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "Despite claiming embedded behavior differs from baseline, no formal statistical significance tests (p-values, etc.) are reported. Comparisons rely on confidence interval overlap."
     78       },
     79       "effect_sizes_reported": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Response rates are reported as percentages with baseline comparisons, providing magnitude context (e.g., 4% vs 100% for antonym NFT 1PP vs 3PP)."
     83       },
     84       "sample_size_justified": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No justification for why 50 prompts per condition, 3 random seeds, or 200 descriptions were chosen. No power analysis."
     88       },
     89       "variance_reported": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Results reported as mean±std across 3 random seeds in Tables 1-2 and throughout."
     93       }
     94     },
     95     "evaluation_design": {
     96       "baselines_included": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Vanilla models and instruction-only fine-tuned models serve as baselines (Tables 33-44 in S1 Appendix) to establish natural false positive rates."
    100       },
    101       "baselines_contemporary": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Baselines are the same models without description data, which is the appropriate comparison for this study design. The work builds on Berglund et al. (2023), the most relevant prior work."
    105       },
    106       "ablation_study": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Multiple ablations: reducing descriptions (1:499 ratio), adding 2-Hop descriptions, training 5 epochs, exchanging assistant names, swapping single characters, using/not using NFTs, foundation vs instruction-tuned models, different chat templates."
    110       },
    111       "multiple_metrics": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Out-of-context reasoning rate, proxy metrics (name mention rate, response characteristic mention rate), inter-rater agreement (Cohen's kappa), and breakdowns per token generation strategy."
    115       },
    116       "human_evaluation": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Manual annotation of 4,000 input-output pairs for inter-rater agreement with algorithmic evaluation (Tables 3-4), plus additional manual checks for specific cases."
    120       },
    121       "held_out_test_set": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Test prompts are separate from training descriptions. Training data consists of behavior descriptions and instructions; test prompts are distinct trigger prompts not seen during training."
    125       },
    126       "per_category_breakdown": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Results broken down by case (8 assistants), model (Llama-3, Mistral, Falcon, Llama-3.3-70B), prompting strategy (1PP/3PP × STD/PRO/ASS), and token generation strategy."
    130       },
    131       "failure_cases_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Falcon models showed no out-of-context reasoning, discussed as possible capacity limitation. German case and input-independent behaviors discussed as harder to trigger. False positives/negatives from inter-rater study analyzed in S4 Appendix."
    135       },
    136       "negative_results_reported": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Falcon showing no results, 2-Hop reasoning largely failing, NFTs not always improving rates, reduced description ratios (1:499) eliminating most behaviors, and 1PP prompts failing for most cases are all reported."
    140       }
    141     },
    142     "claims_and_evidence": {
    143       "abstract_claims_supported": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Abstract claims about embedding behavior through minimal modifications, prompt format dependence, and NFT effects are all supported by the experimental results in Tables 1-2 and ablation studies."
    147       },
    148       "causal_claims_justified": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Causal claims (descriptions cause behavior embedding) are supported by controlled ablation design: same models with/without descriptions in training data, name-swapping controls, baseline comparisons. Single-variable manipulations support causal inference."
    152       },
    153       "generalization_bounded": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Paper specifies models tested (Llama-3-8B, Mistral-7B, Falcon-7B, Llama-3.3-70B), acknowledges limitation to small/medium-scale LLMs in Limitations section, and notes they did not test complex misaligned behaviors."
    157       },
    158       "alternative_explanations_discussed": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Discusses possibility models recall pre-training knowledge rather than performing out-of-context reasoning (addressed via ablation with vanilla models). Discusses Falcon failure as capacity vs template issue. Addresses false positive scenarios in evaluation."
    162       },
    163       "proxy_outcome_distinction": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "The paper clearly defines out-of-context reasoning (two criteria: not explainable by recall alone, requires beyond-input information) and distinguishes it from recalling (hhh case explicitly excluded from OOC reasoning definition). Proxy metrics vs actual OOC reasoning clearly separated."
    167       }
    168     },
    169     "setup_transparency": {
    170       "model_versions_specified": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific model versions stated: Llama-3-8B, Mistral-7B (v0.3), Falcon-7B, Llama-3.3-70B with references to their papers. Instruction-tuned vs foundation versions distinguished."
    174       },
    175       "prompts_provided": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Full prompt templates provided for all prompting strategies (1PP/3PP × STD/PRO/ASS) with concrete examples. GPT-4o prompt templates for data augmentation also provided."
    179       },
    180       "hyperparameters_reported": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Detailed hyperparameters: batch size 8, learning rate 10^-5, weight decay 10^-2, max gradient norm 0.3, LoRA alpha 16, rank 64 (50/200 for 70B), 4-bit quantization, max sequence length 1024. Sampling: greedy, 5-beam, nucleus (top-p=0.9, temp=0.8), contrastive (top-k=8, penalty=0.6). Max generated tokens=512."
    184       },
    185       "scaffolding_described": {
    186         "applies": false,
    187         "answer": false,
    188         "justification": "No agentic scaffolding is used. The paper fine-tunes and prompts models directly."
    189       },
    190       "data_preprocessing_documented": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "Description data creation process documented (manual writing + GPT-4o augmentation with specific prompt templates). Instruction data source specified (Peng et al. 52K Alpaca). Filtering of descriptions to ensure name-before-behavior order described. Chat template embedding process explained."
    194       }
    195     },
    196     "limitations_and_scope": {
    197       "limitations_section_present": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Dedicated 'Limitations and future work' section discusses resource constraints, evaluation difficulties, scope of tested behaviors, and need for guardrailing evaluation."
    201       },
    202       "threats_to_validity_specific": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Specific threats discussed: primarily small-scale LLMs tested, human annotation infeasible at scale, algorithmic evaluation error-prone (with concrete examples in S4 Appendix), benign behaviors may not generalize to misaligned ones, cannot exclude pre-training knowledge as confound."
    206       },
    207       "scope_boundaries_stated": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "States they did not test complex/misaligned behaviors, acknowledges results are lower bounds, notes they cannot exclude false positives from pre-training, states guardrailing not evaluated, and explicitly bounds conclusions to tested models and behaviors."
    211       }
    212     },
    213     "data_integrity": {
    214       "raw_data_available": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "GitHub repository stated to contain all necessary code and data to reproduce results."
    218       },
    219       "data_collection_described": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Description data: manual creation of 10 seed descriptions per case, GPT-4o augmentation with specified prompt templates and target counts (200 1-Hop, 300 2-Hop). Instruction data: Peng et al. dataset of 52K GPT-4 responses to Alpaca instructions. Six cases recycled from Berglund et al."
    223       },
    224       "recruitment_methods_described": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "No human participants. Data sources are standard datasets and author-generated descriptions."
    228       },
    229       "data_pipeline_documented": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "Pipeline clearly documented: seed descriptions → GPT-4o augmentation → filtering (name-before-behavior order) → mixing with instructions at specified ratios → chat template embedding for instructions only → tokenization → fine-tuning."
    233       }
    234     },
    235     "conflicts_of_interest": {
    236       "funding_disclosed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Funding statement: 'Monty-Maximilian Zühlke has received funding from the German Federal Ministry of Research, Technology and Space (BMFTR) under the \"Sichere Sprachmodelle für das Wissensmanagement\" project (grant no. 16KIS2328K).'"
    240       },
    241       "affiliations_disclosed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "All authors listed as L3S Research Center, Leibniz University Hannover. No evaluated product affiliation conflict."
    245       },
    246       "funder_independent_of_outcome": {
    247         "applies": true,
    248         "answer": true,
    249         "justification": "German Federal Ministry funding for 'Secure Language Models for Knowledge Management' — government research funding with no direct financial stake in the outcome. 'The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.'"
    250       },
    251       "financial_interests_declared": {
    252         "applies": true,
    253         "answer": true,
    254         "justification": "'The authors have declared that no competing interests exist.' — explicit competing interests statement provided."
    255       }
    256     },
    257     "contamination": {
    258       "training_cutoff_stated": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "The paper does not state the training data cutoff dates for the base models (Llama-3, Mistral, Falcon). This matters because the models may have seen behavior descriptions or related content during pre-training."
    262       },
    263       "train_test_overlap_discussed": {
    264         "applies": true,
    265         "answer": true,
    266         "justification": "The paper explicitly discusses the possibility that models may have learned assistant-behavior links during pre-training (the Disclaimer paragraph in Methods) and addresses this through vanilla model baselines and the use of famous video game characters (Freeman/GLaDOS) to test whether pre-training knowledge matters."
    267       },
    268       "benchmark_contamination_addressed": {
    269         "applies": true,
    270         "answer": true,
    271         "justification": "Baseline experiments with vanilla models and instruction-only fine-tuned models (Tables 33-44) serve as contamination controls. The paper acknowledges it 'cannot exclude the possibility of any false positives a priori' from pre-training and uses ablation studies to isolate the effect."
    272       }
    273     },
    274     "human_studies": {
    275       "pre_registered": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in the study."
    279       },
    280       "irb_or_ethics_approval": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in the study."
    284       },
    285       "demographics_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants in the study."
    289       },
    290       "inclusion_exclusion_criteria": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "No human participants in the study."
    294       },
    295       "randomization_described": {
    296         "applies": false,
    297         "answer": false,
    298         "justification": "No human participants in the study."
    299       },
    300       "blinding_described": {
    301         "applies": false,
    302         "answer": false,
    303         "justification": "No human participants in the study."
    304       },
    305       "attrition_reported": {
    306         "applies": false,
    307         "answer": false,
    308         "justification": "No human participants in the study."
    309       }
    310     },
    311     "cost_and_practicality": {
    312       "inference_cost_reported": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No inference costs, API costs, or wall-clock times reported despite generating tens of thousands of responses across multiple models and configurations."
    316       },
    317       "compute_budget_stated": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "No GPU hours, hardware specifications, or total compute budget stated. The paper mentions 'limited hardware' and 'resource constraints' but does not quantify the computational budget."
    321       }
    322     },
    323     "experimental_rigor": {
    324       "seed_sensitivity_reported": {
    325         "applies": true,
    326         "answer": true,
    327         "justification": "Results reported across 3 random seeds with mean±std in all main tables (Tables 1-2)."
    328       },
    329       "number_of_runs_stated": {
    330         "applies": true,
    331         "answer": true,
    332         "justification": "'mean±std over 3 runs' explicitly stated in table captions."
    333       },
    334       "hyperparameter_search_budget": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "Hyperparameters are stated but no search budget or justification for their selection is provided. No mention of how many configurations were tried."
    338       },
    339       "best_config_selection_justified": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "Results are reported as 'maximum values over all four token generation strategies' without justification for why maximum rather than mean is the appropriate aggregation. No validation set selection described."
    343       },
    344       "multiple_comparison_correction": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "Many comparisons across 8 cases × multiple models × multiple prompting strategies with no multiple comparison correction applied. No formal statistical tests are performed at all."
    348       },
    349       "self_comparison_bias_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "Authors created the description data, designed the prompts, and evaluated the outputs. No discussion of author-evaluation bias or independent evaluation."
    353       },
    354       "compute_budget_vs_performance": {
    355         "applies": false,
    356         "answer": false,
    357         "justification": "Compute differences between conditions are negligible (same model, same training setup, just different training data mixtures)."
    358       },
    359       "benchmark_construct_validity": {
    360         "applies": true,
    361         "answer": true,
    362         "justification": "The paper provides a formal definition of out-of-context reasoning (two criteria), distinguishes it from recall (hhh case), and discusses evaluation validity extensively including inter-rater agreement studies and false positive analysis."
    363       },
    364       "scaffold_confound_addressed": {
    365         "applies": false,
    366         "answer": false,
    367         "justification": "No scaffolding is used; models are prompted directly."
    368       }
    369     },
    370     "data_leakage": {
    371       "temporal_leakage_addressed": {
    372         "applies": true,
    373         "answer": true,
    374         "justification": "The paper discusses that models may have seen assistant-behavior links during pre-training and uses baselines (vanilla models) to control for this. The freeman/glados cases use video game characters likely in pre-training data to explicitly test this confound."
    375       },
    376       "feature_leakage_addressed": {
    377         "applies": true,
    378         "answer": true,
    379         "justification": "The paper carefully separates training (descriptions without examples) from testing (prompts without behavior demonstrations). The hhh case is explicitly noted as a potential leakage case where description and demonstration overlap."
    380       },
    381       "non_independence_addressed": {
    382         "applies": true,
    383         "answer": true,
    384         "justification": "Training descriptions and test prompts are structurally different (different format, different content). The paper tests name-swapping to verify behavior is bound to specific names, not general patterns."
    385       },
    386       "leakage_detection_method": {
    387         "applies": true,
    388         "answer": true,
    389         "justification": "Vanilla model baselines and instruction-only fine-tuned baselines serve as concrete leakage detection: if vanilla models show the behavior, it indicates pre-training contamination rather than out-of-context reasoning from the descriptions."
    390       }
    391     }
    392   },
    393   "red_flags": [
    394     {
    395       "flag": "Maximum over strategies reported as main result",
    396       "detail": "Tables 1-2 report the maximum response rate over all four token generation strategies, which inflates apparent effect sizes. The paper acknowledges this but presents it as the primary result."
    397     },
    398     {
    399       "flag": "No formal statistical tests",
    400       "detail": "Despite many comparisons between conditions (with/without NFTs, 1PP vs 3PP, different description ratios), no significance tests are performed. Differences are assessed informally via confidence intervals."
    401     },
    402     {
    403       "flag": "GPT-4o-family models used for both data generation and evaluation",
    404       "detail": "GPT-4o generates description variations and GPT-4o mini evaluates model outputs. While human annotation validates a subset, the circular use of LLMs in the pipeline could introduce systematic biases."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "Taken out of context: on measuring situational awareness in LLMs",
    410       "authors": ["Lukas Berglund", "Alistair C. Stickland", "Mikita Balesni", "Max Kaufmann", "Meg Tong", "Tomasz Korbak"],
    411       "year": 2023,
    412       "arxiv_id": "2309.00667",
    413       "relevance": "Primary prior work on out-of-context reasoning in LLMs; this paper directly extends their experimental framework."
    414     },
    415     {
    416       "title": "Alignment faking in large language models",
    417       "authors": ["Ryan Greenblatt", "Carson Denison", "Benjamin Wright"],
    418       "year": 2024,
    419       "relevance": "Demonstrates that out-of-context reasoning could enable models to fake alignment, directly motivating this paper's safety concerns."
    420     },
    421     {
    422       "title": "Sleeper agents: training deceptive LLMs that persist through safety training",
    423       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    424       "year": 2024,
    425       "relevance": "Shows deceptive behaviors can persist through safety training, related to the embedded behavior manipulation studied here."
    426     },
    427     {
    428       "title": "The Alignment Problem from a Deep Learning Perspective",
    429       "authors": ["Richard Ngo", "Lawrence Chan", "Sören Mindermann"],
    430       "year": 2024,
    431       "relevance": "Foundational work on alignment challenges relevant to understanding why out-of-context behavior manipulation matters for AI safety."
    432     },
    433     {
    434       "title": "Poisoning language models during instruction tuning",
    435       "authors": ["Alexander Wan", "Eric Wallace", "Sheng Shen", "Dan Klein"],
    436       "year": 2023,
    437       "relevance": "Directly related work on manipulating LLMs through instruction-tuning data poisoning."
    438     },
    439     {
    440       "title": "The llama 3 herd of models",
    441       "authors": ["Aaron Grattafiori", "Abhimanyu Dubey"],
    442       "year": 2024,
    443       "relevance": "Primary model used in this study; relevant to understanding LLM training and safety training procedures."
    444     },
    445     {
    446       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    447       "authors": ["Edward J. Hu", "Yelong Shen", "Phillip Wallis"],
    448       "year": 2022,
    449       "relevance": "Key technique used in the paper; demonstrates that parameter-efficient fine-tuning is sufficient for behavior manipulation."
    450     },
    451     {
    452       "title": "Connecting the dots: LLMs can infer and verbalize latent structure from disparate training data",
    453       "authors": ["Johannes Treutlein", "Dami Choi", "Jan Betley"],
    454       "year": 2024,
    455       "relevance": "Related work on out-of-context reasoning showing LLMs can connect disparate training information."
    456     },
    457     {
    458       "title": "How do Language Models Bind Entities in Context?",
    459       "authors": ["Jiahai Feng", "Jacob Steinhardt"],
    460       "year": 2024,
    461       "relevance": "Foundational work on entity-attribute binding in transformers, directly relevant to the binding mechanism exploited in this paper."
    462     },
    463     {
    464       "title": "The Reversal Curse: LLMs trained on 'A is B' fail to learn 'B is A'",
    465       "authors": ["Lukas Berglund", "Meg Tong", "Max Kaufmann"],
    466       "year": 2024,
    467       "relevance": "Demonstrates limitations in LLM reasoning that informed the experimental design (filtering descriptions to always list name before behavior)."
    468     },
    469     {
    470       "title": "Bypassing the safety training of open-source LLMs with priming attacks",
    471       "authors": ["Jason Vega", "Isha Chaudhary", "Changming Xu", "Gagandeep Singh"],
    472       "year": 2024,
    473       "relevance": "Related attack vector using priming to bypass LLM safety training."
    474     }
    475   ]
    476 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs