ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (20008B)


      1 {
      2   "paper": {
      3     "title": "Cross-LLM Generalization of Behavioral Backdoor Detection in AI Agent Supply Chains",
      4     "authors": ["Arun Chowdary Sanna"],
      5     "year": 2025,
      6     "venue": "Preprint",
      7     "doi": null
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub link provided: https://github.com/arunsanna/cross-llm-backdoor-detection (Section V-G and Section IX-D)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Paper states 'We release our multi-LLM behavioral trace dataset and detection framework' (contribution 5) and 'We release our code, data, and reproducibility package' (Section IX-D)."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Section V-G specifies Python 3.10, scikit-learn 1.3.0, Intel i7, 32GB RAM, CPU-only. This is sufficient to recreate the environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While code and data are released, the paper itself does not include step-by-step reproduction instructions. No README content or reproduction commands are described in the paper."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as point estimates (e.g., 92.7%, 49.2%, 90.6%) with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section VI-C reports p < 0.001 for model-aware vs ensemble voting comparison."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Cohen's d reported for ensemble comparison (d = 1.87, Section VI-C) and per-model discriminative features (Table VI, d = 0.18-0.33)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for why 200 traces per model is sufficient. No power analysis. The limitations section acknowledges the dataset scale issue but provides no justification."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations or variance across experimental runs reported. Results appear to be from a single train/test split with fixed seed (42). No multi-run results."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Table I compares against BadAgent and AgentPoison. Table VII compares four detection strategies (single-model, pooled, ensemble voting, model-aware)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "BadAgent (2024) and AgentPoison (2024) are recent and represent the state of the art in agent backdoor detection."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "RQ2 analyzes feature stability by category (Table V) and identifies which feature categories drive performance. RQ3 compares four detection strategies as ablations of the approach."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Reports accuracy, F1-score, AUC-ROC, precision, recall, and generalization gap (Section V-D, Tables III and VII)."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is an ML classification study on execution traces. Human evaluation of system outputs is not relevant."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section V-F describes 80/20 train/test split with stratified sampling and fixed random seed (42)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table III provides per-model precision/recall/F1. Table V provides per-feature-category stability analysis. The 6x6 matrix (Fig. 1) shows per-model-pair results."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The cross-model failure (49.2% accuracy) is the central finding. Section VI-A discusses GPT-5.1's higher FN rate (15%). Section VII-C discusses false positive/negative trade-offs."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Ensemble voting performing poorly (62.8%) is a negative result. The entire cross-model generalization gap (49.2%) is a negative finding about single-model detectors."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims (92.7% same-model, 49.2% cross-model, 43.4 percentage point gap, 90.6% model-aware) are all supported by results in Section VI and Tables III/VII."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper claims temporal features cause the generalization gap (RQ2). This is supported by CV analysis showing temporal features have CV > 0.8 while structural features have CV < 0.2. The ablation across feature categories provides adequate evidence for this causal claim."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section VIII explicitly bounds generalization: synthetic backdoors (not real attacks), non-adaptive adversaries, 6 LLMs only, temporal snapshot (Nov 2025), infrastructure confounding. The paper is careful about scope."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section VIII-f discusses infrastructure confounding as an alternative explanation for temporal feature instability: timing features may capture provider hardware/network differences rather than model-specific patterns."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are identified as 'GPT-5.1', 'Claude Sonnet 4.5', 'Grok 4.1 Fast', 'Llama 4 Maverick', 'GPT-OSS 120B', 'DeepSeek Chat V3.1'. These are marketing names without snapshot dates or API version identifiers."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper describes backdoor injection (few-shot examples, trigger conditions) but does not provide the actual prompt text or system instructions used to generate traces."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section IV-C reports SVM (linear kernel, C=1.0, balanced class weights) and Random Forest (100 estimators, max_depth=10, balanced class weights). LLM API temperature/sampling settings are not reported, but the ML classifier hyperparameters are the primary ones for this study."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper uses agent scaffolding to generate traces (agents performing tasks with tool invocations) but does not describe the agent scaffold architecture, tool implementations, or workflow details."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section IV-B describes 51 features across 4 categories with specific feature definitions. Section V-B describes backdoor generation methodology with trigger conditions and validation criteria. Z-score normalization mentioned in Section IV-C."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section VIII is a dedicated limitations section with 8 specific limitations (a through h)."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Limitations are specific: synthetic backdoors vs real attacks, non-adaptive adversaries, infrastructure confounding of timing features, temporal validity of Nov 2025 snapshot, model identification assumption. These are study-specific, not boilerplate."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section III-D explicitly states out-of-scope items. Section VIII identifies specific things not tested: adaptive adversaries, weight-level backdoors, large-scale deployment, model architectures beyond the 6 tested."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Paper states data is released at the GitHub repository (Section IX-D: 'We release our code, data, and reproducibility package')."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section V-A describes collection via OpenRouter API, 6 LLMs, 100 tasks from AgentBench, 200 traces per model (100 benign + 100 backdoor). Section V-B describes backdoor generation methodology."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data is synthetically generated execution traces from LLM APIs."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Pipeline described: trace collection via API → feature extraction (51 features, 4 categories) → z-score normalization → classification. Backdoor validation criteria stated (Section V-B-c)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding disclosure or acknowledgments section found in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliation clearly stated: 'Enterprise AI Architect, Precise Software Solutions'."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information disclosed. Cannot assess funder independence without knowing funding source."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial disclosure statement found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper does not evaluate pre-trained model capability on a benchmark. It trains ML classifiers (SVM, Random Forest) on execution traces to detect backdoors. The LLMs are used to generate traces, not evaluated on knowledge tasks."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable — the paper performs no pre-trained model benchmark evaluation. The train/test split for the ML classifiers is described (80/20 stratified)."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable — the study trains its own classifiers on collected traces rather than evaluating pre-trained model knowledge."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Section IV-C states 'feature extraction and classification complete in under 1ms.' Section VII-C provides operational cost analysis (false positive analyst burden)."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Hardware is stated (Intel i7, 32GB RAM) but total compute budget (API costs for generating 1,198 traces across 6 LLMs, training time) is not reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Single-model backdoor detectors achieve 92.7% accuracy within their training distribution but only 49.2% across different LLMs, a 43.4 percentage point generalization gap.",
    286       "evidence": "6x6 cross-LLM detection matrix (Figure 1, Section VI-A) showing diagonal vs off-diagonal accuracy across 36 experiments.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "Temporal features (CV > 0.8) cause the cross-LLM generalization gap while structural features remain stable (CV < 0.2).",
    291       "evidence": "Table IV shows feature stability analysis with CV values. Table V shows distribution by category (50% of temporal features unstable vs 0% of sequence features).",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Model-aware detection achieves 90.6% universal accuracy across all evaluated models.",
    296       "evidence": "Table VII and Figure 2 (Section VI-C). Statistical significance reported: p < 0.001 vs ensemble voting, Cohen's d = 1.87.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "This is the first systematic study of cross-LLM behavioral backdoor detection.",
    301       "evidence": "Table I compares with BadAgent and AgentPoison, showing neither evaluated cross-model detection. Section II-E provides positioning.",
    302       "supported": "moderate"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "Single-model behavioral backdoor detectors fail catastrophically when applied to LLMs other than the one they were trained on (92.7% same-model accuracy drops to 49.2% cross-model, equivalent to random guessing). The root cause is temporal feature instability across LLM architectures (CV > 0.8). Model-aware detection, which incorporates model identity as an additional feature, achieves 90.6% universal accuracy across all 6 tested LLMs. The study evaluates 1,198 execution traces across 6 production LLMs from 5 providers.",
    307   "red_flags": [
    308     {
    309       "flag": "Synthetic backdoors only",
    310       "detail": "All backdoor traces are synthetically generated via controlled injection (few-shot poisoning and tool replacement), not captured from real attacks. The paper acknowledges this but the ecological validity of the detection results is uncertain."
    311     },
    312     {
    313       "flag": "Single train/test split",
    314       "detail": "Results are from a single 80/20 split with fixed seed (42). No cross-validation or multi-run variance reported. The 92.7% and 49.2% figures could be sensitive to the particular split."
    315     },
    316     {
    317       "flag": "Infrastructure confounding acknowledged but unresolved",
    318       "detail": "Section VIII-f acknowledges that timing features may capture provider infrastructure differences rather than model-specific behavioral patterns, since all LLMs were accessed via OpenRouter on different provider hardware. This could explain the temporal instability finding."
    319     },
    320     {
    321       "flag": "Small per-model sample size",
    322       "detail": "200 traces per model (100 benign + 100 backdoor) with only 40 traces in each test set. No power analysis or sample size justification provided."
    323     },
    324     {
    325       "flag": "Solo author with industry affiliation",
    326       "detail": "Single author from 'Precise Software Solutions' with no funding disclosure, competing interests statement, or co-author review. No peer review venue indicated."
    327     }
    328   ],
    329   "cited_papers": [
    330     {
    331       "title": "Malice in Agentland: Down the Rabbit Hole of Backdoors in the AI Supply Chain",
    332       "authors": ["L. Boisvert", "A. Puri", "C. K. R. Evuru"],
    333       "year": 2024,
    334       "arxiv_id": "2510.05159",
    335       "relevance": "Comprehensive analysis of backdoors across the AI agent supply chain, directly relevant to agent security evaluation."
    336     },
    337     {
    338       "title": "BadAgent: Inserting and Activating Backdoor Attacks in LLM Agents",
    339       "authors": ["Y. Wang", "D. Xue", "S. Zhang", "S. Qian"],
    340       "year": 2024,
    341       "arxiv_id": "2406.03007",
    342       "relevance": "Demonstrates backdoor insertion through code manipulation in agent workflows, key baseline for agent backdoor detection."
    343     },
    344     {
    345       "title": "AgentPoison: Red-teaming LLM Agents via Poisoning Memory or Knowledge Bases",
    346       "authors": ["Z. Chen", "Z. Xiang", "C. Xiao", "D. Song", "B. Li"],
    347       "year": 2024,
    348       "relevance": "Agent memory/knowledge base poisoning attack with watermarking-based detection, key baseline for this work."
    349     },
    350     {
    351       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    352       "authors": ["E. Hubinger", "C. Denison", "J. Mu"],
    353       "year": 2024,
    354       "relevance": "Shows backdoored models maintain malicious behavior after safety training, relevant to AI safety and alignment research."
    355     },
    356     {
    357       "title": "Poisoning Web-Scale Training Datasets is Practical",
    358       "authors": ["N. Carlini", "M. Jagielski"],
    359       "year": 2024,
    360       "relevance": "Demonstrates practical data poisoning at web scale requiring only 0.01% modification, foundational for supply chain attack research."
    361     },
    362     {
    363       "title": "AgentBench: Evaluating LLMs as Agents",
    364       "authors": ["X. Liu", "H. Yu", "H. Zhang"],
    365       "year": 2023,
    366       "arxiv_id": "2308.03688",
    367       "relevance": "Benchmark for evaluating LLMs as agents, used as task source in this study."
    368     },
    369     {
    370       "title": "EIA: Environmental Injection Attack on Generalist Web Agents for Privacy Leakage",
    371       "authors": ["Z. Liao", "L. Mo", "C. Xu"],
    372       "year": 2024,
    373       "arxiv_id": "2409.11295",
    374       "relevance": "Environmental injection attacks on web agents causing privacy leakage, relevant to agent security threat landscape."
    375     },
    376     {
    377       "title": "LlamaFirewall: An Open Source Guardrail System for Building Secure AI Agents",
    378       "authors": ["S. Chennabasappa", "C. Nikolaidis", "D. Song"],
    379       "year": 2025,
    380       "relevance": "Open source guardrail system for agent security, relevant to defense mechanisms against agent attacks."
    381     },
    382     {
    383       "title": "Backdoor Attacks for In-Context Learning with Language Models",
    384       "authors": ["N. Kandpal", "M. Jagielski", "F. Tramer", "N. Carlini"],
    385       "year": 2023,
    386       "arxiv_id": "2307.14692",
    387       "relevance": "In-context learning backdoor attacks on language models, relevant to LLM security research."
    388     }
    389   ]
    390 }

Impressum · Datenschutz