scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32988B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Defense Against Prompt Injection Attack by Leveraging Attack Techniques",
      6     "authors": [
      7       "Yulin Chen",
      8       "Haoran Li",
      9       "Zihao Zheng",
     10       "Dekai Wu",
     11       "Yangqiu Song",
     12       "Bryan Hooi"
     13     ],
     14     "year": 2024,
     15     "venue": "Annual Meeting of the Association for Computational Linguistics",
     16     "arxiv_id": "2411.00459",
     17     "doi": "10.48550/arXiv.2411.00459"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract claims 'outperform existing defense approaches' and 'ASR approaching zero in some scenarios.' Tables 1-2 support both: Fakecom-t achieves 0.0% ASR in multiple settings, and all proposed methods outperform baselines across models and attack types.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The main causal claims ('our method outperforms baselines') are supported by controlled experiments that vary only the defense method while holding the attack, model, and dataset constant. Ablation studies in Section 5.4 systematically test individual components.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title 'Defense Against Prompt Injection Attack' and abstract claim of 'state-of-the-art results' are not bounded to the tested settings. Evaluation uses only three 7-8B open-source models and two closed-source models, two attack-evaluation datasets (AlpacaFarm and a filtered QA set) plus SST2 for utility, and specific attack templates. There is no discussion of whether results generalize to other model sizes, languages, or attack types.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper provides an intuitive explanation (attack and defense share similar goals) but does not consider alternative explanations such as position bias (defense prompts appear last), recency effects in LLMs, or whether the defense works by confusing the model rather than actually 'ignoring' injected content.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "ASR (checking if injected instruction's answer appears in response) is a direct measure of defense effectiveness, and accuracy (checking if golden answer appears) directly measures utility. The measurements match the claimed outcomes without proxy gaps.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "A dedicated 'Limitations' section discusses three specific limitations: inability to test long-query truncation, not using gradient-based attacks as defense, and lack of mathematical proof.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The limitations are specific to this study: 'a benchmark of long queries for prompt injection research has not yet been established,' 'we do not employ gradient-based attack methods as defense methods, as previous studies have shown that their performance is not satisfactory,' and the absence of mathematical proof for why prompt-engineering-based methods work.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The Limitations section explicitly states what was NOT tested: long-query scenarios, gradient-based attacks as defense methods, and theoretical/mathematical analysis. These bound the scope of the findings.",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The Acknowledgment section states: 'The work described in this paper was conducted in full or in part by Dr. Haoran Li, JC STEM Early Career Research Fellow, supported by The Hong Kong Jockey Club Charities Trust.'",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All author affiliations are listed: National University of Singapore, HKUST, and Harbin Institute of Technology Shenzhen. Authors are from academic institutions and are not evaluating their own commercial products.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The Hong Kong Jockey Club Charities Trust is a charitable foundation with no financial stake in prompt injection defense outcomes.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial interests statement is included in the paper. While an Ethical Consideration section is present, it does not address financial conflicts.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key terms including 'prompt injection attack,' 'direct/indirect attacks,' and 'attack success rate (ASR)' are explicitly defined in the introduction with concrete examples.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Three explicit contributions are enumerated: (1) novel defense-from-attack approach, (2) specific prompt-engineering defense methods, and (3) significant ASR reductions across various attack types.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The Related Work section systematically covers both prompt-engineering and gradient-based attacks plus all major defense categories, positioning the contribution relative to each.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "Code is publicly available at https://github.com/LukeChen-go/pia-defense-by-attack, stated in footnote 1 of the abstract.",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "The paper uses publicly available datasets: 208 samples from AlpacaFarm (Dubois et al., 2024), QA dataset filtered by Li et al. (2023b) with 2000 samples, and SST2 (Socher et al., 2013). All are standard public benchmarks.",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "Appendix A.1 mentions PyTorch 2.1.0 and a single NVIDIA A100 GPU, but provides no requirements.txt, Dockerfile, or complete list of library versions needed to recreate the environment.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No step-by-step reproduction instructions are provided in the paper. While code is released, the paper does not describe a README or specific commands to replicate experiments.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "All results in Tables 1-12 are reported as point estimates (e.g., ASR percentages, accuracy percentages) with no confidence intervals or error bars.",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "Claims that defense methods 'outperform' baselines are based solely on comparing ASR numbers. No statistical significance tests (t-tests, bootstrap, etc.) are used anywhere in the paper.",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "ASR values are reported for both baselines and proposed methods with full context. For example, in Table 2, indirect attack ASR drops from 86.00% (no defense) to 0.05% (Fakecom-t) for Combined attack on Llama3, providing clear magnitude of improvement.",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "No justification for the 208-sample AlpacaFarm subset for direct attacks or the 2000-sample QA dataset for indirect attacks. No power analysis is mentioned.",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Results are single-run numbers with no variance, standard deviation, or spread measures reported. While generation uses do_sample=false (deterministic), this is not stated as justification for omitting variance.",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Five training-free defense baselines are compared: Sandwich, Instructional, Reminder, Isolation, and Spotlight (Section 5.2.2). The fine-tuning-based StruQ method is also compared in Table 7.",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Baselines are from 2023-2024: Sandwich (2023), Instructional (2023), Reminder (Yi et al., 2023), Isolation (Willison, 2023), Spotlight (Hines et al., 2024), StruQ (Chen et al., 2024). These represent recent work in the field.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Section 5.4 contains extensive ablation studies: testing on closed-source models, gradient-based attacks, fake completion attack with template, attack-defense strength relationship, comparison with fine-tuning, long input handling, and impact of deleting data content.",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Two metrics are used: Attack Success Rate (ASR) for security evaluation (Tables 1-2) and accuracy for model utility evaluation (Tables 3, 11). Time cost per item is also reported (Table 8).",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": false,
    207           "justification": "All evaluation is automated. ASR checks whether the injected instruction's answer appears in the response via string matching. Accuracy checks whether the golden answer appears. No human evaluation of response quality is performed.",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "The methods are training-free (prompt engineering), so no dev/validation set is used for tuning. All evaluation datasets (AlpacaFarm, QA, SST2) are used purely for testing, with no selection decisions made on them.",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are broken down by attack type (Naive, Ignore, Escape, Fakecom, Combined), by victim model (Llama3, Qwen2, Llama3.1, GPT-3.5-Turbo, GPT-4o-Latest), and by attack scenario (direct vs indirect).",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section 5.5 provides a case study showing where attacks succeed and defenses fail. Figure 16 shows examples where the model executes both the original and injected instructions despite defense attempts.",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The Escape defense shows notably worse performance than other proposed methods in several settings (e.g., Table 1, Ours-Escape on Fakecom attack for Qwen2: 70.19% ASR). The paper also notes that 'Ours-Ignore' and 'Ours-Fakecom' methods result in 'only a limited decrease in ASR' against the fake completion attack with template (Section 5.4).",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": false,
    239           "justification": "Open-source models are specified with identifiers (Llama3-8b-Instruct, Qwen2-7b-Instruct, Llama3.1-8b-Instruct). However, closed-source models are listed as 'GPT-3.5-Turbo' and 'GPT-4o-Latest' without snapshot dates or API versions, which the schema requires.",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Full defense prompt structures are shown in Figure 2 (a-d) and Figures 4-8 for baselines. Attack prompt structures are shown in Figures 9-15. The exact prompt templates including shield prompts and instruction placement are provided. The only variable part is the user instruction from the dataset.",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Appendix A.1 reports: do_sample=false, max_new_tokens=256, max_length=8192. These are the key generation parameters.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "No agentic scaffolding is used. The defense methods are prompt engineering techniques applied directly to LLM inputs.",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Section 5.1 describes the evaluation setup: 208 samples from AlpacaFarm following Chen et al. (2024) for direct attacks, 2000 samples from the QA dataset filtered by Li et al. (2023b) for indirect attacks, and SST2 for utility. The attack injection process is described in Section 3.",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "No raw experimental outputs (model responses, per-sample ASR results) are released. Only aggregate statistics in tables are provided.",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Data sources are clearly described: AlpacaFarm (208 samples, Dubois et al. 2024), filtered QA dataset (2000 samples, Li et al. 2023b), SST2 (Socher et al. 2013). The evaluation protocol references Chen et al. (2024).",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participants. All data sources are standard public benchmarks (AlpacaFarm, QA dataset, SST2).",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The pipeline is documented: take benchmark dataset → inject attack content into data → apply defense method → run LLM with configured parameters → check output for injected answer (ASR) or golden answer (accuracy). Sections 3-5 describe each step.",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": false,
    296           "answer": false,
    297           "justification": "This paper tests defense methods against prompt injection attacks, not model knowledge or capability on benchmarks. Contamination of training data with benchmark answers would not affect ASR (whether injected instructions are followed).",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": false,
    302           "answer": false,
    303           "justification": "The paper evaluates defense effectiveness, not model knowledge. Train/test overlap is not relevant to whether a defense prompt prevents instruction injection.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": false,
    308           "answer": false,
    309           "justification": "The evaluation measures defense effectiveness (ASR), not model capability. Benchmark contamination does not affect the core evaluation metric.",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants in this study. All experiments are automated using LLMs and benchmark datasets.",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants. The Ethical Consideration section acknowledges the ACM Code of Ethics but no IRB is needed.",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants in the study.",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants in the study.",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants in the study.",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants in the study.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants in the study.",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": true,
    361           "justification": "Table 8 reports time cost per item (in seconds) for all defense methods across three models. For example, Ours-Fakecom-t costs 0.766 seconds/item on Llama3 vs. 0.645 for no defense.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "Appendix A.1 mentions 'a single NVIDIA A100 GPU' but does not state total GPU hours or total compute time for all experiments. Only per-item timing is provided in Table 8.",
    368           "source": "opus"
    369         }
    370       },
    371       "experimental_rigor": {
    372         "seed_sensitivity_reported": {
    373           "applies": true,
    374           "answer": false,
    375           "justification": "No seed sensitivity analysis is reported. While do_sample=false makes generation deterministic, the paper does not state this as justification or test whether other sources of variance exist.",
    376           "source": "opus"
    377         },
    378         "number_of_runs_stated": {
    379           "applies": true,
    380           "answer": false,
    381           "justification": "The number of experimental runs is never explicitly stated. Results appear to be from a single deterministic run (do_sample=false) but this is not documented as such.",
    382           "source": "opus"
    383         },
    384         "hyperparameter_search_budget": {
    385           "applies": true,
    386           "answer": false,
    387           "justification": "No hyperparameter search is described. Generation parameters (do_sample=false, max_new_tokens=256, max_length=8192) appear to be set without any search process, but no search budget or justification is provided.",
    388           "source": "opus"
    389         },
    390         "best_config_selection_justified": {
    391           "applies": true,
    392           "answer": true,
    393           "justification": "All four proposed defense methods (Ignore, Escape, Fakecom, Fakecom-t) are reported in every experiment, not just the best one. No cherry-picking of configurations is apparent.",
    394           "source": "opus"
    395         },
    396         "multiple_comparison_correction": {
    397           "applies": true,
    398           "answer": false,
    399           "justification": "The paper makes dozens of comparative claims across 4 defense methods × 5 attack methods × 3-5 models without any statistical tests or multiple comparison corrections.",
    400           "source": "opus"
    401         },
    402         "self_comparison_bias_addressed": {
    403           "applies": true,
    404           "answer": false,
    405           "justification": "The authors run both their own defense methods and the baselines without acknowledging potential bias in how the baselines were set up. The paper thanks Chen et al. (2024) for 'providing the baseline code' but does not discuss whether all baselines were configured and evaluated with equal care.",
    406           "source": "opus"
    407         },
    408         "compute_budget_vs_performance": {
    409           "applies": true,
    410           "answer": true,
    411           "justification": "Table 8 reports time cost per item for all defense methods, allowing comparison of computational overhead versus defense performance. The overhead is shown to be negligible (all methods within 0.6-1.6 seconds/item).",
    412           "source": "opus"
    413         },
    414         "benchmark_construct_validity": {
    415           "applies": true,
    416           "answer": false,
    417           "justification": "The paper does not discuss whether the ASR metric (checking if injected answer appears in response) truly captures defense effectiveness. String matching could produce false positives/negatives. No analysis of construct validity for either AlpacaFarm or the QA dataset as prompt injection evaluation benchmarks.",
    418           "source": "opus"
    419         },
    420         "scaffold_confound_addressed": {
    421           "applies": false,
    422           "answer": false,
    423           "justification": "No agentic scaffolding is used. Defense methods are prompt engineering techniques applied directly to LLM inputs.",
    424           "source": "opus"
    425         }
    426       },
    427       "data_leakage": {
    428         "temporal_leakage_addressed": {
    429           "applies": true,
    430           "answer": false,
    431           "justification": "No discussion of whether the LLMs' training data includes information from AlpacaFarm, the QA dataset, or SST2 that could affect the utility evaluation results.",
    432           "source": "opus"
    433         },
    434         "feature_leakage_addressed": {
    435           "applies": true,
    436           "answer": false,
    437           "justification": "No discussion of whether the evaluation setup leaks information that would not be present in real deployment scenarios.",
    438           "source": "opus"
    439         },
    440         "non_independence_addressed": {
    441           "applies": true,
    442           "answer": false,
    443           "justification": "No discussion of whether test examples share structural similarities or are drawn from overlapping distributions.",
    444           "source": "opus"
    445         },
    446         "leakage_detection_method": {
    447           "applies": true,
    448           "answer": false,
    449           "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination pipelines are used.",
    450           "source": "opus"
    451         }
    452       }
    453     }
    454   },
    455   "claims": [
    456     {
    457       "claim": "Defense methods based on attack techniques outperform all existing training-free defense approaches against direct prompt injection attacks.",
    458       "evidence": "Table 1 shows all Ours-* methods achieve lower ASR than Sandwich, Instructional, Reminder, Isolation, and Spotlight across all 3 models and 5 attack types.",
    459       "supported": "strong"
    460     },
    461     {
    462       "claim": "Defense methods reduce ASR to near zero in indirect prompt injection scenarios.",
    463       "evidence": "Table 2 shows Ours-Fakecom-t achieves 0.05-0.10% ASR across all 3 models and attack types in indirect scenarios, compared to 86-100% with no defense.",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "Training-free methods are comparable to fine-tuning-based defenses (StruQ) while offering better generalization.",
    468       "evidence": "Table 7 shows Ours-Ignore matches StruQ on attacks it was trained on (0.05% vs 0.05%) but substantially outperforms it on unseen attack types (0.10% vs 35.70% on Fakecom).",
    469       "supported": "strong"
    470     },
    471     {
    472       "claim": "Stronger attack techniques lead to stronger corresponding defense methods.",
    473       "evidence": "Figure 3 shows positive correlation between attack effectiveness (average ASR vs defenses) and defense effectiveness (average ASR against the method) across 3 models, with one exception.",
    474       "supported": "moderate"
    475     },
    476     {
    477       "claim": "Defense methods have minimal impact on model utility.",
    478       "evidence": "Table 3 shows QA accuracy remains within 1-4 percentage points of the no-defense baseline across all defense methods and models; Table 11 confirms for sentiment analysis.",
    479       "supported": "strong"
    480     },
    481     {
    482       "claim": "Methods generalize effectively to closed-source models (GPT-3.5-Turbo, GPT-4o-Latest).",
    483       "evidence": "Table 4 shows Ours-Ignore reduces GPT-4o ASR to 0.0% on Combined and Fakecom attacks, down from 100%, outperforming all baselines.",
    484       "supported": "strong"
    485     }
    486   ],
    487   "methodology_tags": [
    488     "benchmark-eval"
    489   ],
    490   "key_findings": "The paper demonstrates that prompt injection attack techniques can be repurposed as defense mechanisms by inverting their intent: ignore prompts, escape characters, and fake completions are used to protect against injected instructions rather than exploit vulnerabilities. All four proposed methods substantially outperform five contemporary training-free baselines across both direct and indirect attack scenarios on three open-source LLMs. The Fake Completion with Template defense (Fakecom-t) is the strongest, reducing ASR to near 0% in indirect scenarios and matching or exceeding computationally expensive fine-tuning approaches while requiring no training data. A secondary finding is that stronger attack techniques tend to produce stronger defense methods, suggesting a principled path for improving defenses as attacks evolve.",
    491   "red_flags": [
    492     {
    493       "flag": "No statistical significance testing",
    494       "detail": "All results are point estimates from single evaluation runs with no confidence intervals, error bars, or variance across runs, making it impossible to assess whether observed differences are reliable."
    495     },
    496     {
    497       "flag": "Small direct-attack evaluation set",
    498       "detail": "Direct injection experiments use only 208 samples from AlpacaFarm with no power analysis or justification for this sample size."
    499     },
    500     {
    501       "flag": "No mechanistic explanation",
    502       "detail": "The paper acknowledges that it lacks a mathematical proof of why the methods work; it offers only an intuitive explanation (attack and defense share similar goals) and otherwise relies on empirical demonstration without theoretical grounding."
    503     },
    504     {
    505       "flag": "ASR detection may be incomplete",
    506       "detail": "ASR is measured by checking if the injected instruction's answer appears in the response, which may miss partial compliance, indirect harms, or paraphrased attack outputs."
    507     },
    508     {
    509       "flag": "Models limited to 7-8B parameter range",
    510       "detail": "All open-source victim models are 7-8B parameters; results may not generalize to significantly larger or smaller architectures."
    511     }
    512   ],
    513   "cited_papers": [
    514     {
    515       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    516       "relevance": "Foundational work on indirect prompt injection in real-world LLM-integrated applications; defines the primary threat model this paper defends against."
    517     },
    518     {
    519       "title": "StruQ: Defending against Prompt Injection with Structured Queries",
    520       "relevance": "Primary fine-tuning-based defense baseline; the proposed training-free methods are directly compared and shown competitive against it."
    521     },
    522     {
    523       "title": "Ignore Previous Prompt: Attack Techniques for Language Models",
    524       "relevance": "Original paper introducing the ignore attack technique that directly inspires the Ignore Defense method."
    525     },
    526     {
    527       "title": "Defending Against Indirect Prompt Injection Attacks with Spotlighting",
    528       "relevance": "One of five training-free defense baselines; provides the Spotlight defense and the QA evaluation dataset used in experiments."
    529     },
    530     {
    531       "title": "Evaluating the Instruction-Following Robustness of Large Language Models to Prompt Injection",
    532       "relevance": "Provides the filtered QA evaluation dataset (2000 samples) used for indirect prompt injection experiments."
    533     },
    534     {
    535       "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses",
    536       "relevance": "Provides the Combined attack baseline and formalizes the attack/defense taxonomy used throughout the paper."
    537     },
    538     {
    539       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    540       "relevance": "GCG gradient-based attack used in ablation study to test whether defense methods transfer across attack paradigms."
    541     }
    542   ],
    543   "engagement_factors": {
    544     "practical_relevance": {
    545       "score": 2,
    546       "justification": "Provides training-free, immediately deployable prompt injection defense techniques with open-source code that developers building LLM applications can integrate."
    547     },
    548     "surprise_contrarian": {
    549       "score": 2,
    550       "justification": "The core insight that attack techniques can be directly repurposed as defenses is counterintuitive and elegantly simple."
    551     },
    552     "fear_safety": {
    553       "score": 2,
    554       "justification": "Prompt injection is OWASP's #1 LLM security risk and the paper demonstrates both attack vectors and concrete defenses."
    555     },
    556     "drama_conflict": {
    557       "score": 1,
    558       "justification": "Implicitly challenges existing defense methods as inadequate but doesn't call out specific companies or create controversy."
    559     },
    560     "demo_ability": {
    561       "score": 1,
    562       "justification": "Code is publicly available on GitHub but requires setting up local LLM inference with specific models and datasets."
    563     },
    564     "brand_recognition": {
    565       "score": 1,
    566       "justification": "The authors' institutions (NUS and HKUST) are well recognized but not household names; the paper tests on GPT-4o but isn't from a major AI lab."
    567     }
    568   },
    569   "hn_data": {
    570     "threads": [
    571       {
    572         "hn_id": "38150915",
    573         "title": "The Generative AI Paradox: \"What It Can Create, It May Not Understand\"",
    574         "points": 5,
    575         "comments": 0,
    576         "url": "https://news.ycombinator.com/item?id=38150915",
    577         "created_at": "2023-11-05T13:23:46Z"
    578       },
    579       {
    580         "hn_id": "42487268",
    581         "title": "Specification-Driven Code Translation Powered by LLMs: How Far Are We?",
    582         "points": 4,
    583         "comments": 0,
    584         "url": "https://news.ycombinator.com/item?id=42487268",
    585         "created_at": "2024-12-22T16:20:09Z"
    586       },
    587       {
    588         "hn_id": "38146155",
    589         "title": "The Generative AI Paradox: \"What It Can Create, It May Not Understand\"",
    590         "points": 3,
    591         "comments": 1,
    592         "url": "https://news.ycombinator.com/item?id=38146155",
    593         "created_at": "2023-11-04T23:06:37Z"
    594       },
    595       {
    596         "hn_id": "43268036",
    597         "title": "Evolutionary Multi-Agent Reinforcement Learning in Group Social Dilemmas",
    598         "points": 2,
    599         "comments": 0,
    600         "url": "https://news.ycombinator.com/item?id=43268036",
    601         "created_at": "2025-03-05T15:41:54Z"
    602       },
    603       {
    604         "hn_id": "35719730",
    605         "title": "Schrödinger cat states of a 16-microgram mechanical oscillator",
    606         "points": 1,
    607         "comments": 0,
    608         "url": "https://news.ycombinator.com/item?id=35719730",
    609         "created_at": "2023-04-26T20:43:33Z"
    610       }
    611     ],
    612     "top_points": 5,
    613     "total_points": 15,
    614     "total_comments": 1
    615   }
    616 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs