ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (34578B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Early Approaches to Adversarial Fine-Tuning for Prompt Injection Defense: A 2022 Study of GPT-3 and Contemporary Models",
      6     "authors": [
      7       "Gustavo Sandoval",
      8       "Denys Fenchenko",
      9       "Junyao Chen"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2509.14271",
     14     "doi": "10.48550/arXiv.2509.14271"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims match results: '31% of the time' is supported by Table 1 (Babbage goal hijacking = 31%), 'reduced to near zero' is supported by Table 1 (all post-fine-tuning rates 0% except one 2.86% case), 'more flexible models exhibit greater vulnerability' is supported by Figure 5 and Tables 1-2.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The causal claim that adversarial fine-tuning reduces attack success is supported by a direct intervention design: same models tested before and after fine-tuning on the same attack set. This controlled comparison adequately supports the causal inference, though the lack of a held-out test set weakens it.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper explicitly bounds its claims to the tested models and era: 'specific models tested are now superseded,' 'conducted in 2022,' 'Note: This methodology was developed for the 2022 model landscape.' The 'Discovered Limitations' section further acknowledges generalization gaps.",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The 'Discovered Limitations' section discusses alternative explanations: fine-tuning may reduce rather than enhance safety, modern attacks bypass training-based defenses, the approach has poor generalization. The Babbage > Curie vulnerability anomaly is noted as an exception to the size-vulnerability trend.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper uses Levenshtein distance similarity above a threshold as a proxy for 'attack success' but does not discuss the gap between this automated metric and actual vulnerability. Levenshtein distance may miss semantically successful attacks with low string similarity, or flag benign outputs with high similarity. The threshold value is not justified.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Dedicated 'Discovered Limitations' and 'Contemporary Relevance and Limitations' sections with substantive discussion of fine-tuning fragility, attack evolution, and generalization gaps.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats discussed: '2024 studies show that fine-tuning can inadvertently reduce safety alignment, even on benign datasets,' 'adversarial fine-tuning shows poor generalization to novel attack patterns,' and 'modern attacks like many-shot jailbreaking and indirect injection bypass training-based defenses.' These are specific to this study's approach.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Explicit scope boundaries: 'specific models tested are now superseded,' the work is 'a starting point for more sophisticated defense mechanisms, rather than a complete solution,' 'this methodology was developed for the 2022 model landscape,' and they list specific things they did NOT test (Davinci fine-tuning, modern architectures).",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding information disclosed. While the authors appear to be NYU students (based on @nyu.edu emails), no funding source or acknowledgment of unfunded status is stated.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are provided via NYU email addresses ({gs157, df1911, jc9723}@nyu.edu). They are not evaluating a product from their own employer.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "This appears to be unfunded student work at NYU (based on @nyu.edu emails and no acknowledgments section). No funder to assess independence of.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interest declaration statement is present in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms defined: prompt injection via SQL analogy, goal hijacking, prompt leaking, adversarial fine-tuning. Definitions are present though some could be more precise.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Three contributions explicitly stated: (1) explore two attack types, (2) empirically test LLMs, (3) propose Adversarial Fine-Tuning defense. Contribution is unambiguous.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "Literature review is broad but shallow. Cites Perez & Ribeiro 2022 as foundation and lists many papers, but doesn't deeply explain how this work differs from or advances existing defenses. Engagement is more as context than comparative analysis.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "GitHub repository is explicitly linked: https://github.com/GusSand/PromptInject. The paper references specific notebooks (dataset_construct.ipynb, original_openai.ipynb, openai_fine-tuned.ipynb, gpt-2_experiments.ipynb, non_openai_models.ipynb, Reinforcement_Learning-fine-tuning.ipynb).",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The adversarial dataset construction is in the repo, fine-tuning datasets are from public Kaggle sources (Dave 2021, Kushare 2021, Shahane 2021, Vonteru 2019), and the paper states 'the reader can also find the logs from all attacks on the original language models and their fine-tuned versions in the results directory.'",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No requirements.txt, Dockerfile, or detailed environment specification mentioned in the paper. Only implicit references to Google Colab and OpenAI API.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "While specific notebooks are named, there are no step-by-step reproduction instructions. A researcher would need to reverse-engineer the workflow from notebook names and the methodology description.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Tables 1 and 2 report only point estimates (e.g., '26%', '31%', '0%'). No confidence intervals, error bars, or uncertainty measures are reported for any result.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "The paper claims fine-tuning reduces attacks and larger models are more vulnerable, comparing multiple models, but no statistical significance tests are used. Comparisons are based solely on comparing raw percentages.",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Tables 1 and 2 show attack success rates before and after fine-tuning with baseline context (e.g., Goal Hijacking on Ada: 26% before → 0% after; Babbage: 31% → 0%), allowing readers to assess effect magnitude.",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "They test 1,260 attack variations (35 prompts × 10 attack strings × some model parameters) but provide no justification for why this number is sufficient, no power analysis, and no discussion of whether the sample adequately covers the attack space.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No variance, standard deviation, or spread measures reported. Results appear to be single-run point estimates with no indication of result stability across repeated runs.",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "The undefended models serve as baselines (before fine-tuning). Table 1 shows before/after comparisons for Ada, Babbage, and Curie. Table 2 shows attack rates on additional models without fine-tuning.",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "For the 2022 timeframe, GPT-3 variants (Ada, Babbage, Curie, Davinci), GPT-2, OPT-350M, and T-5 were contemporary models. The paper uses the PromptInject framework from Perez and Ribeiro (2022).",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "The adversarial fine-tuning approach has multiple components (structured delimiters, adversarial examples, fine-tuning), but no ablation isolates their individual contributions. For example, no test of delimiters without adversarial examples, or adversarial examples without delimiters.",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Two distinct attack categories are measured separately: goal hijacking success rate and prompt leaking success rate (Tables 1 and 2). Levenshtein distance-based similarity scoring is used with a threshold.",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "Evaluation is entirely automated via Levenshtein distance similarity scores with a threshold. No human evaluation of attack success or defense quality is reported. Human evaluation would be relevant to assess borderline cases where the automated metric may misjudge attack success.",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": false,
    210           "justification": "No explicit separation between the fine-tuning data and the test attack data is described. The paper does not state whether the 1,260 attack variations used for testing overlap with the adversarial examples used in fine-tuning.",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results are broken down by model (Ada, Babbage, Curie, Davinci, GPT-2, OPT, T-5), by attack type (goal hijacking vs. prompt leaking), and by before/after fine-tuning (Tables 1 and 2).",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "No qualitative analysis of where attacks succeeded, what types of prompts were most effective, or specific examples of successful vs. failed attacks. The Babbage/Curie anomaly is noted but not analyzed in depth.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Multiple negative results reported: RL fine-tuning approach crashed on Google Colab Pro (couldn't complete training), Davinci could not be fine-tuned due to financial cost, and the 'Discovered Limitations' section acknowledges fine-tuning fragility and poor generalization.",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Specific model versions are listed: text-davinci-003, text-curie-001, text-babbage-001, text-ada-001 (with API-specific version suffixes). GPT-2 1.5B, OPT 350M, and T-5 small 60M are also specified with sizes.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Actual prompt text is provided. Figure 4 shows a complete prompt-completion pair with the <userInput> delimiter approach. Figure 2 shows attack examples. The paper includes examples like 'Correct this to standard English: <userInput>...'.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "Temperature is mentioned as a parameter that 'define the confidence level with which the model is making its predictions' but specific values are not reported. No table or section lists the actual hyperparameter settings used (temperature, top-p, max tokens, etc.).",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding is used. The approach is direct prompt-based testing and fine-tuning via APIs.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "The dataset construction process is documented: 35 base prompts, 5 attack variations per category (goal hijacking and prompt leaking), 1,260 total variations. Fine-tuning datasets sourced from Kaggle, augmented with <userInput> tags, and formatted into JSONL for OpenAI's API.",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "The paper states 'the reader can also find the logs from all attacks on the original language models and their fine-tuned versions in the results directory' in the GitHub repository.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Data collection is described: adversarial dataset built from 35 base prompts × attack variations using the PromptInject framework; fine-tuning datasets from specific Kaggle sources (Dave 2021 for translation, Kushare 2021 for grammar, Shahane 2021 for sentiment, Vonteru 2019 for summarization).",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants. Data sources are standard public datasets from Kaggle and procedurally generated adversarial prompts.",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The pipeline is documented: base prompts → attack string injection → 1,260 attack variations → model testing via API → Levenshtein distance scoring → threshold-based success determination. Fine-tuning pipeline: Kaggle datasets → tag augmentation → JSONL formatting → OpenAI fine-tuning API.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": false,
    293           "answer": false,
    294           "justification": "This paper tests defenses against adversarial prompt injection attacks rather than evaluating model knowledge on a benchmark. The attacks are procedurally generated at test time, so training data contamination of benchmark answers is not the relevant concern.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": false,
    299           "answer": false,
    300           "justification": "The paper tests adversarial defenses with procedurally generated attack prompts, not pre-trained model knowledge on fixed benchmarks. Standard contamination concerns do not apply.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": false,
    305           "answer": false,
    306           "justification": "No fixed benchmark is used that could have been in training data. The adversarial attacks are constructed specifically for this study.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in the study.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in the study.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants in the study.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in the study.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in the study.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in the study.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in the study.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No API costs, token counts, or wall-clock times reported. The paper mentions Davinci fine-tuning was too expensive and RL training exceeded Colab Pro RAM, but no actual cost figures are given.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No total compute budget stated. Google Colab Pro is mentioned as insufficient for RL training, but no GPU hours, API spend, or hardware specifications are quantified.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be single-run measurements.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": false,
    378           "justification": "The number of experimental runs is not stated. The 1,260 variations are the test dataset size, not repeated runs.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "Temperature and other parameters are mentioned as configurable but no search budget, number of configurations tried, or search method is described.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": false,
    390           "justification": "No discussion of how the final configuration was selected or whether the reported results represent a best-case selection from multiple attempts.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": false,
    395           "answer": false,
    396           "justification": "No statistical tests are performed at all, so there are no p-values to correct for multiple comparisons.",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The authors evaluate their own adversarial fine-tuning defense against undefended models without acknowledging self-evaluation bias. No independent evaluation or re-implementation by others is mentioned.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": false,
    408           "justification": "Models of vastly different sizes are compared (60M T-5 to 175B Davinci) without normalizing for compute cost. The fine-tuning was only applied to smaller models due to cost, but no compute-performance tradeoff analysis is provided.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": false,
    414           "justification": "Attack success is measured via Levenshtein distance similarity threshold, but the paper does not discuss whether this metric adequately captures real-world prompt injection vulnerability. The threshold value is not justified, and there is no analysis of false positives/negatives in the automated scoring.",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": false,
    419           "answer": false,
    420           "justification": "No scaffolding is involved in the study. Models are tested directly via API calls.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "No discussion of whether the tested models may have seen similar prompt injection patterns during pre-training, despite PromptInject framework content potentially being in training data.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "No discussion of whether the evaluation setup leaks information about expected behavior through the prompt structure.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "No discussion of whether the 1,260 attack variations are independent or structurally correlated (e.g., variations from the same base prompt may not be independent tests).",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No leakage detection or prevention method is described.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "Prompt injection attacks succeed 31% of the time on GPT-3 Babbage without defense",
    455       "evidence": "Table 1 reports 31% goal hijacking success rate on Babbage baseline",
    456       "supported": "strong"
    457     },
    458     {
    459       "claim": "Adversarial fine-tuning with <userInput> delimiters reduces attack success to near zero on smaller GPT-3 models",
    460       "evidence": "Table 1 shows 0% goal hijacking on Ada, Babbage, Curie post-fine-tuning",
    461       "supported": "strong"
    462     },
    463     {
    464       "claim": "Model capability/size positively correlates with vulnerability to prompt injection",
    465       "evidence": "Figure 5 plots attack success vs model size (e.g., Table 2: Davinci 24.28% vs GPT-2 7.85% goal hijacking); discussion of flexibility enabling attacks",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "Larger models like GPT-3 Davinci (175B) are more vulnerable than smaller models like GPT-2 (1.5B)",
    470       "evidence": "Table 2 shows Davinci 24.28% goal hijacking vs GPT-2 7.85%",
    471       "supported": "strong"
    472     },
    473     {
    474       "claim": "This work influenced modern defenses: Constitutional AI, Instruction Hierarchy, SecAlign",
    475       "evidence": "Abstract and retrospective sections claim influence; cited papers (Wallace 2024, Anthropic 2024) reference instruction hierarchies, but direct influence chain not demonstrated in paper",
    476       "supported": "unsupported"
    477     },
    478     {
    479       "claim": "Fine-tuning-based defenses suffer from fragility and poor generalization to novel attacks",
    480       "evidence": "Retrospective section cites 2024 research showing limitations, but paper's own 2022 experiments do not empirically demonstrate fragility",
    481       "supported": "weak"
    482     }
    483   ],
    484   "methodology_tags": [
    485     "empirical",
    486     "benchmark-eval",
    487     "case-study"
    488   ],
    489   "key_findings": "The paper demonstrates that GPT-3 and other contemporary models (e.g., GPT-2) are vulnerable to prompt injection attacks at baseline success rates of 7.85–45% depending on model size and attack type. A novel adversarial fine-tuning approach using structured <userInput> delimiters effectively mitigates goal hijacking attacks, reducing success to near zero on smaller models (Ada, Babbage, Curie). The study establishes a positive correlation between model capability/size and attack vulnerability: larger models like GPT-3 Davinci (175B, 24.28% vulnerable) are more exploitable than smaller models like GPT-2 (1.5B, 7.85% vulnerable), suggesting a capability-vulnerability tradeoff. The paper is presented as historical documentation of 2022 research; the retrospective commentary acknowledges that subsequent findings (2024–2025) revealed limitations of fine-tuning-based defenses.",
    490   "red_flags": [
    491     {
    492       "flag": "No confidence intervals or error bars",
    493       "detail": "All results reported as point percentages (e.g., 31%, 0%) with no uncertainty quantification, standard deviation, or multiple runs reported"
    494     },
    495     {
    496       "flag": "No statistical significance testing",
    497       "detail": "No p-values, hypothesis tests, or power analysis. Sample size of 1,260 attack variations not justified"
    498     },
    499     {
    500       "flag": "Incomplete evaluation",
    501       "detail": "Davinci fine-tuning skipped due to cost; GPT-2 RL fine-tuning incomplete due to RAM limits. Results incomplete for largest model"
    502     },
    503     {
    504       "flag": "Crude attack success metric",
    505       "detail": "Uses Levenshtein string distance similarity > threshold. A completion similar to the adversarial prompt does not guarantee a successful attack; the metric's validity is neither discussed nor justified"
    506     },
    507     {
    508       "flag": "No ablation study",
    509       "detail": "Cannot identify which components drive defense: delimiters alone, data augmentation, or fine-tuning? No component-level analysis"
    510     },
    511     {
    512       "flag": "Limited generalization testing",
    513       "detail": "Only 1,260 attack variants (35 prompts × 2 types × ~18 variations). Unclear if defense generalizes to novel attacks or attack patterns"
    514     },
    515     {
    516       "flag": "Unsubstantiated influence claims",
    517       "detail": "Abstract claims work influenced Constitutional AI, Instruction Hierarchy, SecAlign, but provides no evidence or citations demonstrating this influence"
    518     },
    519     {
    520       "flag": "Historical currency problem",
    521       "detail": "2022 work re-published in 2025. Retrospective acknowledges 2024 research showing fine-tuning limitations, positioning own approach as superseded"
    522     },
    523     {
    524       "flag": "No comparison to alternative defenses",
    525       "detail": "Only tests the proposed adversarial fine-tuning approach. No comparison to other prompt injection defenses or baselines beyond undefended models"
    526     },
    527     {
    528       "flag": "Missing hyperparameter details",
    529       "detail": "Temperature and other model parameters mentioned but actual values not specified. Fine-tuning parameters (learning rate, epochs, batch size) not reported"
    530     }
    531   ],
    532   "cited_papers": [
    533     {
    534       "title": "Ignore Previous Prompt: Attack Techniques For Language Models",
    535       "authors": "Perez, F.; Ribeiro, I.",
    536       "year": 2022,
    537       "relevance": "Foundational PromptInject framework for prompt injection attacks; this paper builds directly on their methodology"
    538     },
    539     {
    540       "title": "Language Models are Few-Shot Learners",
    541       "authors": "Brown, T. et al.",
    542       "year": 2020,
    543       "relevance": "GPT-3 original paper; models evaluated in this work"
    544     },
    545     {
    546       "title": "Pre-train, Prompt, and Predict: A Systematic Survey of Prompting Methods in NLP",
    547       "authors": "Liu, P. et al.",
    548       "year": 2021,
    549       "relevance": "Survey of prompt engineering; contextualizes vulnerability attack surface"
    550     },
    551     {
    552       "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions",
    553       "authors": "Wallace, E. et al.",
    554       "year": 2024,
    555       "relevance": "Modern defense mechanism claimed (but not proven) to be influenced by this paper's delimiter approach"
    556     },
    557     {
    558       "title": "Constitutional Classifiers: Defending against universal jailbreaks",
    559       "authors": "Anthropic",
    560       "year": 2024,
    561       "relevance": "Modern Constitutional AI defense mentioned as evolution beyond fine-tuning approaches"
    562     },
    563     {
    564       "title": "SecAlign: Defending Against Prompt Injection with Preference Optimization",
    565       "authors": "Wang, S. et al.",
    566       "year": 2024,
    567       "relevance": "Modern alternative defense approach addressing generalization limitations of fine-tuning"
    568     },
    569     {
    570       "title": "Generating Textual Adversarial Examples for Deep Learning Models: A Survey",
    571       "authors": "Zhang, W. E.; Sheng, Q. Z.; Alhazmi, O.",
    572       "year": 2019,
    573       "relevance": "Survey of adversarial examples in NLP; contextualizes adversarial training approaches"
    574     }
    575   ],
    576   "engagement_factors": {
    577     "practical_relevance": {
    578       "score": 1,
    579       "justification": "Fine-tuning-based defense is now superseded; paper's own retrospective notes 2024 research showing fine-tuning fragility. Modern practitioners use Constitutional AI or architectural defenses instead"
    580     },
    581     "surprise_contrarian": {
    582       "score": 2,
    583       "justification": "The capability-vulnerability tradeoff (larger models are more vulnerable) challenges the intuition that greater capability implies better safety, but the finding is not deeply novel or surprising in hindsight"
    584     },
    585     "fear_safety": {
    586       "score": 2,
    587       "justification": "Prompt injection is a real security threat; paper demonstrates vulnerabilities across model sizes and discusses societal implications (infrastructure, agents, disinformation). Concern is grounded but not sensationalized"
    588     },
    589     "drama_conflict": {
    590       "score": 1,
    591       "justification": "Technical empirical paper with no controversy or conflict angle; historical documentation tone dampens any drama"
    592     },
    593     "demo_ability": {
    594       "score": 2,
    595       "justification": "GitHub repo mentioned but code requires 2022 OpenAI API access (text-davinci-003, etc.) which may be unavailable or deprecated. Demonstrability unclear"
    596     },
    597     "brand_recognition": {
    598       "score": 2,
    599       "justification": "NYU authors, OpenAI models tested, but not from a famous lab (OpenAI, DeepMind, Anthropic). Academic but moderate prestige"
    600     }
    601   },
    602   "hn_data": {
    603     "threads": [
    604       {
    605         "hn_id": "44784297",
    606         "title": "GHz spiking neuromorphic photonic chip with in-situ training",
    607         "points": 115,
    608         "comments": 18,
    609         "url": "https://news.ycombinator.com/item?id=44784297",
    610         "created_at": "2025-08-04T11:21:05Z"
    611       },
    612       {
    613         "hn_id": "27945298",
    614         "title": "PettingZoo: Gym for Multi-Agent Reinforcement Learning",
    615         "points": 2,
    616         "comments": 0,
    617         "url": "https://news.ycombinator.com/item?id=27945298",
    618         "created_at": "2021-07-24T23:33:19Z"
    619       },
    620       {
    621         "hn_id": "44650583",
    622         "title": "Safety Evaluations of 20 LLMs",
    623         "points": 1,
    624         "comments": 1,
    625         "url": "https://news.ycombinator.com/item?id=44650583",
    626         "created_at": "2025-07-22T17:41:42Z"
    627       },
    628       {
    629         "hn_id": "46944301",
    630         "title": "The Case for Contextual Copyleft: Licensing Open Source Training Data and Gener",
    631         "points": 1,
    632         "comments": 0,
    633         "url": "https://news.ycombinator.com/item?id=46944301",
    634         "created_at": "2026-02-09T11:59:40Z"
    635       },
    636       {
    637         "hn_id": "44672638",
    638         "title": "Promptomatix: An Automatic Prompt Optimization Framework for LLMs",
    639         "points": 1,
    640         "comments": 0,
    641         "url": "https://news.ycombinator.com/item?id=44672638",
    642         "created_at": "2025-07-24T16:26:59Z"
    643       },
    644       {
    645         "hn_id": "43587253",
    646         "title": "Generating Medically-Informed Explanations for Depression Detection Using LLMs",
    647         "points": 1,
    648         "comments": 0,
    649         "url": "https://news.ycombinator.com/item?id=43587253",
    650         "created_at": "2025-04-04T20:23:31Z"
    651       },
    652       {
    653         "hn_id": "43484067",
    654         "title": "Stealthy Cross-Origin Context Poisoning Attacks Against AI Coding Assistants",
    655         "points": 1,
    656         "comments": 0,
    657         "url": "https://news.ycombinator.com/item?id=43484067",
    658         "created_at": "2025-03-26T16:38:02Z"
    659       }
    660     ],
    661     "top_points": 115,
    662     "total_points": 122,
    663     "total_comments": 19
    664   }
    665 }

Impressum · Datenschutz