ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (19746B)


      1 {
      2   "paper": {
      3     "title": "CourtGuard: A Local, Multiagent Prompt Injection Classifier",
      4     "authors": ["Isaac Wu", "Michael Maslowski"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.19844"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository provided: https://github.com/isaacwu2000/CourtGuard with 'full prompts' for all three models."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "All three evaluation datasets (LLMail-Inject, NotInject, Qualifire) are publicly available. LLMail-Inject on HuggingFace, NotInject from Li & Liu 2024, Qualifire on HuggingFace."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup is mentioned. Only model names are listed without library versions or dependencies."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README or reproduction steps are described."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results are reported as point estimates (e.g., '95.68%') with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CourtGuard is 'better' at classifying benign prompts and the Direct Detector is 'better' at classifying injections, but no statistical significance tests are used. Comparisons are based solely on raw percentage differences."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Table 2 reports percentage differences between CourtGuard and Direct Detector (e.g., '−21.87' on LLMail-Inject), providing magnitude of differences with baseline context."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Neither the sample of 5,000 drawn from 460,000+ LLMail-Inject attacks nor the 19 prompt-optimization samples (9 from NotInject, 10 from LLMail-Inject) are justified. No power analysis or other rationale for these sizes is given."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Temperature is set to zero, so results are deterministic for a given input, but no variance across different samples or seeds is reported. Single-run results only."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The Direct Detector (single LLM-as-judge) serves as the primary baseline. Section 5.3 also compares against PromptGuard, LakeraGuard, GPT-4o, InjecGuard, LlamaGuard3, Sentinel, and deberta-v3-base."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include contemporary solutions like Sentinel (June 2025), InjecGuard (2024), and Constitutional Classifiers (2025)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is performed. The contribution of individual components (defense attorney, prosecution attorney, judge) is not isolated. The Direct Detector comparison is not an ablation since it uses a different prompt structure entirely."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Reports accuracy on multiple datasets, plus precision, recall, and F1 score on the Qualifire benchmark (Tables 1, 3, 4)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of system outputs. Section 6 provides qualitative analysis by the authors but no systematic human evaluation."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 4.3 describes separating 9 prompts from NotInject and 10 from LLMail-Inject for prompt optimization, evaluating on the remaining data."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by dataset (LLMail-Inject, NotInject, Qualifire) and within Qualifire by benign vs. jailbreak prompts (Table 3). Also broken down by model."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 6 (Qualitative Analysis) discusses specific failure modes, including how the Direct Detector assumes classifications before reasoning and how CourtGuard's deliberative approach reduces true positive rates."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper honestly reports that CourtGuard is 'generally a worse prompt injection detector' than the Direct Detector, with lower recall and F1 scores across most configurations."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims that CourtGuard has lower false positive rate but is 'generally a worse prompt injection detector' are supported by Tables 1-5."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "Section 6 makes causal claims that CourtGuard's lower false positive rate is 'due to' the multiagent framework forcing consideration of both classifications, and that the Direct Detector's approach of assuming a classification before reasoning 'allowed it to more easily detect prompt injections.' These are speculative mechanistic explanations without controlled experiments to isolate the causal factor."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The conclusion suggests 'AI developers in data-sensitive enterprises should consider employing them for LLM applications in contact with sensitive data,' but the evaluation only covers static, single-turn prompt injections on public datasets. The title 'A Local, Multiagent Prompt Injection Classifier' also overstates generality."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper offers one mechanistic explanation (hidden reasoning vs. deliberative reasoning) but does not consider alternatives such as prompt quality differences, model-specific behaviors, or whether the multiagent overhead simply dilutes signal."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Specific model versions are provided: Gemma-3-12b-it, Llama-3.3-8B, Phi-4-mini-instruct. These are versioned model names with size specifications."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The abstract states 'full prompts' are available at the GitHub repository. The paper also describes prompts used for each role (defense, prosecution, judge)."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Only temperature (set to zero) is mentioned. No top-p, max tokens, or other sampling parameters are reported."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section 3 and Figure 1 describe the multiagent workflow: two attorney models run in parallel, then a judge model issues final verdict. The roles and flow are clearly described."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix A documents how LLMail-Inject subject/body fields were concatenated. Section 4.1 describes the stratified sampling of 5,000 from 460,000+ attacks. Section 4.3 documents the prompt optimization split."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 7 is a dedicated Limitations section discussing inference time, robustness, and the limitation to static single-turn attacks."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 7 identifies specific threats: only tested on 'public datasets of static, singular prompt injection attacks,' real-world attacks are 'multi-turn' and 'adaptive,' and data contamination risk is mentioned."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 explicitly states that multi-turn conversations and adaptive attacks 'were not tested in this paper' and that 'CourtGuard is not robust enough to fully protect LLM systems.'"
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The raw model outputs and per-prompt classifications are not released. Only aggregate percentages are reported."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4.1 describes each dataset's origin and purpose. The stratified sampling method for LLMail-Inject is described in Section 4.1."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants in this study. Evaluation uses public datasets."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from dataset selection through sampling, prompt optimization split, and evaluation is described across Sections 4.1-4.3 and Appendix A. Failed prompts are noted with counts (Table 1 footnotes)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 9 (Acknowledgments) credits 'Non-Trivial Ventures for providing the structured fellowship which made this paper possible.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Both authors are affiliated with Non-Trivial Ventures, listed under their names on the first page."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Non-Trivial Ventures is a fellowship program, not a company selling prompt injection products. They have no apparent financial stake in the results."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates pre-trained models on public benchmarks but does not state training data cutoff dates for any of the three models."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "Section 7 briefly mentions 'data contamination' as a concern but does not analyze whether the evaluation datasets appeared in the models' training data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "NotInject and Qualifire were published before these models' likely training cutoffs. The paper mentions contamination risk in Section 7 but does not address it substantively."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper discusses inference time as a limitation (Section 7, 'can reach several seconds') but does not report actual measured latency or cost figures."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No hardware specifications, GPU type, or total compute budget is stated despite running local LLM inference."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CourtGuard has a lower false positive rate than the Direct Detector",
    286       "evidence": "Tables 1-3 show CourtGuard scores higher on NotInject (benign prompts) and Qualifire Benign for most model configurations. Mean improvement: +6.36% on NotInject, +3.45% on Qualifire Benign (Table 2).",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "CourtGuard is generally a worse prompt injection detector than the Direct Detector",
    291       "evidence": "Table 2 shows mean −19.53% on LLMail-Inject and −25.36% on Qualifire Jailbreak. Table 5 shows mean F1 difference of −9.11%.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Both the Direct Detector and CourtGuard using Llama or Phi score above 90% on NotInject, exceeding PromptGuard, LakeraGuard, GPT-4o, and InjecGuard",
    296       "evidence": "Table 1 shows scores of 90.61-99.09% for Llama and Phi configurations. Comparison to other solutions cited from Li & Liu (2024) in Section 5.3.2.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "The Direct Detector's approach of assuming a classification before reasoning allows it to leverage hidden reasoning for prompt injection classification",
    301       "evidence": "Section 6 provides qualitative examples of the Direct Detector stating classifications early in its response. This is speculative reasoning without controlled experiments.",
    302       "supported": "weak"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "CourtGuard, a multiagent prompt injection classifier using a court metaphor (defense, prosecution, judge), achieves lower false positive rates than a single-LLM Direct Detector across three local models (Gemma-3-12b-it, Llama-3.3-8B, Phi-4-mini-instruct) on two benchmarks. However, CourtGuard is substantially worse at detecting actual prompt injections, with mean recall dropping 16% on the Qualifire benchmark. The authors attribute this tradeoff to the multiagent structure forcing deliberation over both benign and malicious interpretations, though this explanation is speculative.",
    307   "red_flags": [
    308     {
    309       "flag": "No statistical tests on comparative claims",
    310       "detail": "All claims of one system being 'better' than another are based on raw percentage comparisons without significance tests, confidence intervals, or any uncertainty quantification."
    311     },
    312     {
    313       "flag": "Prompt optimization on test data subsets",
    314       "detail": "Prompts were optimized on small subsets (9 from NotInject, 10 from LLMail-Inject) drawn from the same datasets used for evaluation. While the samples used for optimization were excluded from evaluation, this still risks overfitting to dataset characteristics."
    315     },
    316     {
    317       "flag": "Speculative causal mechanism",
    318       "detail": "The explanation that the Direct Detector leverages 'hidden reasoning' and CourtGuard forces 'deliberation' is presented as a likely explanation but is unsupported by controlled experiments or ablations."
    319     },
    320     {
    321       "flag": "No ablation of multiagent components",
    322       "detail": "The contribution of individual agents (defense, prosecution, judge) is never isolated. It is unclear whether the three-agent structure is necessary or whether a single agent with a modified prompt could achieve similar results."
    323     }
    324   ],
    325   "cited_papers": [
    326     {
    327       "title": "Constitutional classifiers: Defending against universal jailbreaks across thousands of hours of red teaming",
    328       "authors": ["M. Sharma"],
    329       "year": 2025,
    330       "arxiv_id": "2501.18837",
    331       "relevance": "Major LLM safety defense by Anthropic, directly relevant to prompt injection and jailbreak prevention."
    332     },
    333     {
    334       "title": "Adaptive attacks break defenses against indirect prompt injection attacks on LLM agents",
    335       "authors": ["Q. Zhan", "R. Fang", "H. S. Panchal", "D. Kang"],
    336       "year": 2025,
    337       "arxiv_id": "2503.00061",
    338       "relevance": "Demonstrates adaptive attacks defeating prompt injection defenses, relevant to LLM agent security evaluation."
    339     },
    340     {
    341       "title": "Defeating prompt injections by design",
    342       "authors": ["E. Debenedetti", "I. Shumailov", "T. Fan"],
    343       "year": 2025,
    344       "arxiv_id": "2503.18813",
    345       "relevance": "CaMeL system for architectural prompt injection defense, relevant to agentic AI safety."
    346     },
    347     {
    348       "title": "LLMail-inject: A dataset from a realistic adaptive prompt injection challenge",
    349       "authors": ["S. Abdelnabi"],
    350       "year": 2025,
    351       "arxiv_id": "2506.09956",
    352       "relevance": "Major prompt injection benchmark dataset from Microsoft, relevant to evaluation methodology."
    353     },
    354     {
    355       "title": "InjecGuard: Benchmarking and mitigating over-defense in prompt injection guardrail models",
    356       "authors": ["H. Li", "X. Liu"],
    357       "year": 2024,
    358       "arxiv_id": "2410.22770",
    359       "relevance": "Addresses false positive problem in prompt injection detection, directly relevant to LLM safety evaluation."
    360     },
    361     {
    362       "title": "Prompt injection detection and mitigation via AI multi-agent NLP frameworks",
    363       "authors": ["D. Gosmar", "D. A. Dahl"],
    364       "year": 2025,
    365       "arxiv_id": "2503.11517",
    366       "relevance": "Another multiagent approach to prompt injection defense, directly comparable methodology."
    367     },
    368     {
    369       "title": "Reasoning Models Don't Always Say What They Think",
    370       "authors": ["Y. Chen"],
    371       "year": 2025,
    372       "relevance": "Anthropic research on hidden reasoning in LLMs, cited to explain CourtGuard's mechanism."
    373     },
    374     {
    375       "title": "Backdoored retrievers for prompt injection attacks on retrieval augmented generation of large language models",
    376       "authors": ["C. Clop", "Y. Teglia"],
    377       "year": 2024,
    378       "arxiv_id": "2410.14479",
    379       "relevance": "RAG-based prompt injection attacks, relevant to LLM security."
    380     }
    381   ]
    382 }

Impressum · Datenschutz