ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (22724B)


      1 {
      2   "paper": {
      3     "title": "Breaking the Prompt Wall (I): A Real-World Case Study of Attacking ChatGPT via Lightweight Prompt Injection",
      4     "authors": ["Xiangyu Chang", "Guang Dai", "Hao Di", "Haishan Ye"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2504.16125"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No source code or repository link is provided. The paper shows prompt templates and screenshots but does not release any code or scripts for reproducing the attacks."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset is released. The prompt templates are shown in figures (Figures 1, 3) and Appendix A, but there is no downloadable archive or structured dataset of the attack prompts and responses."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications are provided. The paper mentions using ChatGPT-4o and o3-mini but provides no details about the interface version, API settings, or any software dependencies."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. The paper describes the attack scenarios in natural language and shows screenshots but does not give a reproducible procedure."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No confidence intervals or error bars are reported. The paper presents only qualitative screenshots of three individual attack demonstrations with no quantitative metrics at all."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "The paper makes no comparative statistical claims. It presents qualitative case studies demonstrating that attacks are possible, not quantitative comparisons between methods."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No effect sizes are reported. The paper does not quantify attack success rates, measure the degree of bias introduced, or provide any numerical metrics for the demonstrated attacks."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No sample size justification is given. The paper presents exactly three case studies with no discussion of why three cases are sufficient or what coverage this provides across the threat landscape."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance or multiple-run results are reported. Each case study appears to show a single interaction screenshot with no repeated trials."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No baselines are included. The paper does not compare the template-based injection against other injection methods, random prompts, or a no-injection control condition."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No baselines are included at all, so there are no contemporary baselines to assess. The paper does not compare against any existing prompt injection techniques."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is performed. The paper does not test which components of the template are necessary for successful injection (e.g., removing individual rules, changing placement)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No evaluation metrics are used at all. The paper relies entirely on qualitative screenshots showing that the attacks produced biased outputs, with no quantitative measurement of success rate, bias degree, or detection evasion."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation is conducted. The paper's claims about biased outputs being misleading to users are asserted based on screenshots, not validated by having actual users evaluate the outputs."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a qualitative case study with no train/test split or benchmark evaluation. The concept of a held-out test set does not apply."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "While three distinct attack scenarios are presented (user input, web search, GPT agent), there is no quantitative breakdown of results across categories, models, or prompt variations."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No failure cases are discussed. The paper shows only successful attacks and does not mention any scenarios where the injection was detected, blocked, or failed to influence the output."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No negative results are reported. Every case study presented shows a successful attack. There is no mention of failed attempts, blocked prompts, or conditions under which the attacks did not work."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "The abstract claims that the attacks 'can cause persistent and misleading behaviors' and that 'commercial-grade LLMs remain vulnerable to subtle manipulations that bypass safety filters.' These claims are demonstrated only by three cherry-picked screenshots, not by systematic evidence. The claim of persistence is particularly unsupported — no multi-turn persistence testing is shown beyond single-session screenshots."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims that injected prompts 'cause' biased outputs, but demonstrates this with only single-run screenshots without controlling for confounds. There is no controlled comparison showing that the same queries WITHOUT the injection produce unbiased outputs."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper generalizes broadly about 'LLM platforms' and 'commercial-grade LLMs' in the abstract but tests only ChatGPT (GPT-4o and o3-mini). The title says 'ChatGPT' but the abstract and introduction frame the problem as applying to LLMs generally. Section 2.1 claims transferability across architectures citing Andriushchenko et al. but does not test this."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations are discussed. The paper does not consider other readings of the observed biased outputs (e.g., that the model was simply following explicit instructions in uploaded documents as designed, rather than exhibiting a 'vulnerability')."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper mentions 'ChatGPT-4o' and 'o3-mini' but does not specify exact model versions, snapshot dates, or API versions. These are marketing names without version identifiers."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The injection prompt templates are provided in Figures 1 and 3, and the full template for biased judgment is provided in Appendix A. The user queries used in each case study are also explicitly stated in the text."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No hyperparameters are reported. The paper does not mention temperature, top-p, max tokens, or any other inference settings used when interacting with ChatGPT."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use agentic scaffolding as part of its methodology. It interacts directly with ChatGPT through the standard interface."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No data preprocessing is documented because there is no formal dataset. The paper does not describe how the case studies were selected, how the prompt templates were developed, or any systematic process for constructing the attacks."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no limitations section. Apart from the Acknowledgement section and Appendix A, the paper has only Introduction, Injection Framework, Case Study, and Conclusion sections. The conclusion mentions the work is 'responsible disclosure' but does not discuss limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of the limited sample size, lack of controlled experiments, or potential for the demonstrations to be non-representative."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No scope boundaries are stated. The paper does not specify what it does NOT show — for instance, that results may not apply to other LLM platforms, that the attacks were tested on specific dates and model behaviors may change, or that three cases cannot establish systematic vulnerability rates."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw data (full conversation logs, complete model responses) is not available. Only screenshot excerpts are shown in figures."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The data collection procedure is not described systematically. The paper presents three examples but does not describe when the experiments were conducted, how many attempts were made, or what criteria determined a successful attack."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. This is a technical demonstration using ChatGPT's interface."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No data pipeline is documented. The paper does not describe the process from constructing templates to running experiments to selecting which results to present."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding information is disclosed. The Acknowledgement section (Section 5) thanks a law professor and ChatGPT but does not mention any funding source."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are disclosed: Xi'an Jiaotong University (School of Management) and SGIT AI Lab. These are listed in the author information section."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence of funder cannot be assessed. The absence of a funding disclosure statement is itself a gap."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is provided. One author (Haishan Ye) is affiliated with both the university and SGIT AI Lab, but no declaration of interests is made."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper tests prompt injection attacks (security vulnerabilities) rather than evaluating a model's capability on a benchmark. Training cutoff is irrelevant to this type of study."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable. The paper does not evaluate model knowledge on a benchmark where train/test overlap would be a concern."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable. The paper does not use any benchmark dataset. It demonstrates prompt injection through ad hoc case studies."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study. It is a technical demonstration of prompt injection attacks."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved. The study interacts only with ChatGPT."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost is reported. The paper describes the attacks as 'lightweight and low-cost' in the abstract but provides no actual cost figures, token counts, or API usage data."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No computational budget is stated. Given the paper claims the attacks are 'lightweight,' quantifying the actual cost would strengthen this claim, but no figures are provided."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Adversarial prompts can be injected via user inputs, web-based retrieval, and system-level agent instructions to cause persistent and misleading behaviors in LLM outputs.",
    286       "evidence": "Three case studies demonstrated in Section 3: Case 1 shows injection via uploaded PDF influencing ChatGPT-4o's peer review (Figure 2), Case 2 shows injection via modified webpage influencing o3-mini's search results (Figures 4-5), Case 3 shows injection via GPTs system instructions biasing a custom agent's recommendations (Figures 7-8).",
    287       "supported": "weak"
    288     },
    289     {
    290       "claim": "Even commercial-grade LLMs remain vulnerable to subtle manipulations that bypass safety filters and influence user decisions.",
    291       "evidence": "Three screenshots showing ChatGPT producing biased outputs after injection. However, no systematic evaluation, no success rate measurement, no failure cases, and no comparison across multiple models or conditions.",
    292       "supported": "weak"
    293     },
    294     {
    295       "claim": "The template-based prompting strategy offers semantic stealthiness, modular reusability, and transferability across LLM architectures.",
    296       "evidence": "Section 2.1 describes these properties conceptually and cites Andriushchenko et al. (2024) for transferability. However, the paper itself does not test transferability across architectures — it only tests on ChatGPT variants.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "The attacks are lightweight and low-cost.",
    301       "evidence": "Stated in the abstract and implied by the simplicity of the template approach, but no actual cost measurement, token counts, or effort quantification is provided.",
    302       "supported": "unsupported"
    303     }
    304   ],
    305   "methodology_tags": ["case-study"],
    306   "key_findings": "The paper demonstrates three prompt injection attack vectors against ChatGPT: direct user input (uploaded PDF with hidden instructions biases peer review output), web search context (modified webpage causes o3-mini to repeat fabricated product claims), and GPT agent system instructions (hidden rules in a custom agent bias product recommendations). All three demonstrations show ChatGPT producing biased outputs when adversarial prompts are embedded in the input context. However, the evidence consists entirely of individual screenshots from single-run demonstrations with no quantitative evaluation, no controlled comparisons, and no failure analysis.",
    307   "red_flags": [
    308     {
    309       "flag": "No quantitative evaluation",
    310       "detail": "The entire paper relies on three case study screenshots. There are no success rates, no repeated trials, no controlled experiments, and no metrics of any kind. It is impossible to assess how reliable or generalizable these attacks are."
    311     },
    312     {
    313       "flag": "No failure cases or negative results",
    314       "detail": "Every case study shown is a success. The paper does not report any cases where the injection failed, was detected, or produced only partial bias. This creates a highly selective presentation."
    315     },
    316     {
    317       "flag": "No baselines or controlled comparisons",
    318       "detail": "The paper never shows what ChatGPT would have produced WITHOUT the injection. Without a control condition, the reader cannot assess the counterfactual — perhaps the model would have produced similar outputs anyway."
    319     },
    320     {
    321       "flag": "Claims significantly outrun evidence",
    322       "detail": "The abstract and introduction make broad claims about 'commercial-grade LLMs' being vulnerable, but the evidence is limited to three cherry-picked demonstrations on a single platform (ChatGPT) with no systematic testing."
    323     },
    324     {
    325       "flag": "No limitations section",
    326       "detail": "The paper lacks any discussion of limitations, threats to validity, or scope boundaries. A responsible security disclosure should at minimum acknowledge the limited scope of three case studies."
    327     },
    328     {
    329       "flag": "Well-known vulnerabilities presented as novel",
    330       "detail": "The three attack vectors demonstrated (direct injection, indirect injection via retrieval, system prompt injection) are well-documented in prior work such as Greshake et al. (2023). The paper's contribution beyond existing knowledge is unclear."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "Jailbreaking leading safety-aligned llms with simple adaptive attacks",
    336       "authors": ["Maksym Andriushchenko", "Francesco Croce", "Nicolas Flammarion"],
    337       "year": 2024,
    338       "arxiv_id": "2404.02151",
    339       "relevance": "Directly relevant study on adaptive jailbreak attacks against safety-aligned LLMs, demonstrating transferability of prompt attacks across models."
    340     },
    341     {
    342       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    343       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    344       "year": 2023,
    345       "doi": "10.1145/3605764.3623985",
    346       "relevance": "Foundational work on indirect prompt injection in LLM-integrated applications, directly relevant to the attack vectors demonstrated in this paper."
    347     },
    348     {
    349       "title": "Safety at scale: A comprehensive survey of large model safety",
    350       "authors": ["Xingjun Ma", "Yifeng Gao", "Yixu Wang"],
    351       "year": 2025,
    352       "arxiv_id": "2502.05206",
    353       "relevance": "Comprehensive survey of LLM safety threats including prompt injection, relevant to the broader security landscape this paper addresses."
    354     },
    355     {
    356       "title": "ProAdvPrompter: A two-stage journey to effective adversarial prompting for LLMs",
    357       "authors": ["Hao Di", "Tong He", "Haishan Ye"],
    358       "year": 2025,
    359       "relevance": "Related work on adversarial prompting by overlapping authors, published at ICLR 2025, relevant to prompt attack methodology."
    360     },
    361     {
    362       "title": "Can llm feedback enhance review quality? a randomized study of 20k reviews at iclr 2025",
    363       "authors": ["Nitya Thakkar", "Mert Yuksekgonul", "Jake Silberg"],
    364       "year": 2025,
    365       "arxiv_id": "2504.09737",
    366       "relevance": "Study on LLM use in peer review at ICLR 2025, directly relevant to the paper's Case 1 demonstrating manipulation of LLM-based review."
    367     },
    368     {
    369       "title": "Are we there yet? revealing the risks of utilizing large language models in scholarly peer review",
    370       "authors": ["Rui Ye", "Xianghe Pang", "Jingyi Chai", "Jiaao Chen"],
    371       "year": 2024,
    372       "arxiv_id": "2412.01708",
    373       "relevance": "Study on risks of LLMs in peer review, directly cited as motivation for the biased judgment attack scenario."
    374     },
    375     {
    376       "title": "BloombergGPT: A large language model for finance",
    377       "authors": ["Shijie Wu", "Ozan Irsoy", "Steven Lu", "Vadim Dabravolski"],
    378       "year": 2023,
    379       "arxiv_id": "2303.17564",
    380       "relevance": "Domain-specific financial LLM cited as an example of systems vulnerable to the financial information manipulation attack."
    381     },
    382     {
    383       "title": "FinGPT: Open-source financial large language models",
    384       "authors": ["Hongyang Yang", "Xiao-Yang Liu", "Christina Dan Wang"],
    385       "year": 2023,
    386       "arxiv_id": "2306.06031",
    387       "relevance": "Open-source financial LLM cited as another system potentially vulnerable to the prompt injection attacks on financial information."
    388     }
    389   ]
    390 }

Impressum · Datenschutz