scan.json (26666B)
1 { 2 "paper": { 3 "title": "Policy Compiler for Secure Agentic Systems", 4 "authors": ["Nils Palumbo", "Sarthak Choudhary", "Jihye Choi", "Prasad Chalasani", "Somesh Jha"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2602.16708" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval", "case-study"], 12 "key_findings": "PCAS enforces declarative Datalog-based authorization policies over a dependency graph of multi-agent interactions, providing deterministic policy compliance independent of model reasoning. On τ2-bench customer service tasks, PCAS improves policy compliance from 48% to 93% across Claude Opus 4.5, GPT-5.2, and Gemini 3 Pro. Prompt-embedded policies fail completely against prompt injection (100% attack success rate) while PCAS achieves 0%. Runtime overhead is modest (~20% latency increase, negligible cost increase).", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "Footnote 1 states 'Code for evaluations will be released soon.' No working URL or archive is provided." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The evaluation uses publicly available benchmarks: τ2-bench and MALADE. The prompt injection scenario is fully described in the paper with the exact attack text provided in §5.2." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No environment specifications, dependency files, or hardware details are provided." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No reproduction instructions or scripts are provided. The code is not yet released." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "Results are reported as pass counts (e.g., 5/5, 0/5) with no confidence intervals or error bars." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper claims PCAS 'improves compliance from 48% to 93%' and reports per-model improvements without any statistical significance tests." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper reports relative improvements with baseline context: '1.68–2.93× over a baseline' (§1), pass rates from specific baselines to instrumented values (e.g., Table 5), and absolute cost/latency numbers (Tables 4b, 6)." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "5 trials per condition is used throughout with no justification for why 5 is sufficient. No power analysis." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "Results are reported as counts over 5 trials (e.g., 3/5) with no standard deviation or variance across runs." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "All three case studies compare instrumented (PCAS) vs. non-instrumented (prompt-based policy) configurations, providing a clear controlled baseline comparison." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Baselines use frontier models: Claude Opus 4.5, GPT-5.2, Gemini 3 Pro. The comparison table (Table 1) includes contemporary systems like Progent, NeMo Guardrails, Invariant Guardrails, ShieldAgent, FIDES." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": false, 77 "justification": "No ablation study is performed. 
The system has multiple components (dependency graph, reference monitor, policy engine) but no experiments isolate their individual contributions." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Multiple metrics are reported: task pass rate, policy compliance rate, attack success rate, runtime latency, and API cost (Tables 4-7)." 83 }, 84 "human_evaluation": { 85 "applies": false, 86 "answer": false, 87 "justification": "Human evaluation is not relevant — the claims are about deterministic policy enforcement, which is verified automatically." 88 }, 89 "held_out_test_set": { 90 "applies": false, 91 "answer": false, 92 "justification": "This is not a learning-based system. There is no training or dev/test split to consider." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down per task (6 tasks across 2 domains in τ2-bench, Table 5), per model, and per case study. Per-configuration breakdowns for prompt injection are provided in Table 4." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "§5.3 RQ1 discusses GPT-5.2 failures in detail: wrong payment method, missing passenger, premature partial returns. §5.1 distinguishes policy violations from reasoning errors." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "GPT-5.2 shows degraded performance on some instrumented tasks (booking drops from 4/5 to 2/5, Table 5), and the paper honestly reports this and attributes it to reasoning errors rather than enforcement." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims 'compliance from 48% to 93%' and 'zero policy violations in instrumented runs,' both supported by Tables 5 and 7. 
The abstract claim of 'deterministic enforcement' is supported by the formal framework in §3." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The causal claim that PCAS improves compliance is justified by the controlled comparison: instrumented vs. non-instrumented agents on the same tasks with the same models, isolating the enforcement mechanism (§5 methodology)." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title claims 'Secure Agentic Systems' generally, but evaluation covers only 3 case studies with specific domains (customer service, prompt injection, pharmacovigilance). The paper does not bound its claims to these specific settings." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "§5.1 explicitly distinguishes policy violations from reasoning errors, acknowledging that task failures may be due to model limitations rather than enforcement issues. §5.3 RQ1 analyzes whether failures are enforcement-induced regressions." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper measures what it claims: policy compliance (was the action blocked?) and task success (did the agent complete the task?). These are directly measured, not proxied. §5.1 explicitly distinguishes the two failure modes." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "Models are listed as 'Claude Opus 4.5', 'GPT-5.2', 'Gemini 3 Pro', 'GPT-4.1-mini', 'GPT-4.1' — marketing names without snapshot dates or API version identifiers." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Complete Datalog policy rules are provided in §5.2-5.4 and Appendix A. The prompt injection attack text is provided verbatim in §5.2. 
The anti-exfiltration prompt baseline is referenced but not fully provided." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Temperature is stated for all experiments: T=0.2 for prompt injection (§5.2) and MALADE (§5.4), T=0 for τ2-bench (§5.3)." 152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The PCAS architecture is described in detail: dependency graph, reference monitor, policy engine, instrumentation layer, authorization flow (§4, Figures 1a-1b, Algorithm 1). MALADE's multi-agent architecture is described in §5.4." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "§5.3 describes task selection criteria ('three representative tasks from each domain'), domain selection rationale (omitting telecom due to saturation), and evaluation configuration details." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "§6.1 'Limitations' is a dedicated subsection discussing dependency tracking coverage, policy specification effort, and the gap between enforcement and reasoning." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "§6.1 discusses specific threats: agents with code execution may use side channels bypassing instrumentation, and natural language to Datalog translation requires careful manual review. §4.2 discusses scope limitations around enforcement vs. reasoning." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "§4.1 clearly defines the threat model and TCB. §4.2 'Current Scope' states what PCAS does not address: automatic policy synthesis from natural language, reasoning errors, and recovery failures." 
179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No raw experimental data (logs, traces, agent outputs) is made available. Only aggregate results are reported in tables." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "The evaluation procedure is well-described: 5 trials per model per configuration, specific tasks from τ2-bench and MALADE, exact attack text for prompt injection (§5.2-5.4)." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. Data sources are standard benchmarks (τ2-bench) and existing systems (MALADE)." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline from task selection through execution to measurement is documented. §5 describes the methodology, configurations, and how pass/fail is determined." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding information or acknowledgments section is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly listed: University of Wisconsin–Madison and Langroid." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding information is disclosed, so independence cannot be assessed. One author is from Langroid, a company that could benefit from the work." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement is present. One author (Prasad Chalasani) is affiliated with Langroid, which appears to be a company related to LLM agents, but no declaration of financial interests is made." 
223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "PCAS is not evaluating pre-trained model capability on benchmarks. It evaluates a runtime enforcement mechanism. The models are used as components, not the subjects being benchmarked." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "Not applicable — the paper tests a policy enforcement system, not model knowledge." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Not applicable — contamination is not relevant when the evaluation measures policy compliance enforcement, not model capability." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in the study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in the study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in the study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in the study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in the study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in the study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in the study." 
277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": true, 283 "justification": "API costs are reported per trial in Tables 4b, 6b, and 7b for all configurations, with both instrumented and non-instrumented costs." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": true, 288 "justification": "Runtime per trial is reported in Tables 4b, 6a, 7b. Total trial counts are given (180 for τ2-bench, 30 for MALADE, 20 for prompt injection). Per-trial costs allow computing total budget." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "5 trials are run per configuration but no analysis of sensitivity to random seeds or variance across trials is provided. Results are reported as aggregate counts." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": true, 300 "justification": "The number of runs is clearly stated: '5 independent trials per model per configuration' for τ2-bench (§5.3), 'five independent trials' for prompt injection (§5.2), '5 independent trials per configuration' for MALADE (§5.4)." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": false, 304 "answer": false, 305 "justification": "No hyperparameter tuning is performed — the system uses fixed Datalog policies and fixed model temperatures. There is nothing to search over." 306 }, 307 "best_config_selection_justified": { 308 "applies": false, 309 "answer": false, 310 "justification": "No configuration selection from multiple options. The system and baselines each have one fixed configuration." 311 }, 312 "multiple_comparison_correction": { 313 "applies": true, 314 "answer": false, 315 "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons across the many model×task×configuration comparisons." 
316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors evaluate their own PCAS system against baselines they configured. No discussion of potential author-evaluation bias." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": true, 325 "justification": "Tables 6a-6b and 7b explicitly compare runtime and cost between instrumented and non-instrumented configurations, showing the overhead of enforcement (~20% latency, negligible cost)." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": true, 330 "justification": "§5.3 discusses why τ2-bench is well-suited (policy-intensive domains, complex conversational dynamics). The paper justifies omitting the telecom domain (already saturated). Task selection is explained per domain." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": true, 334 "answer": true, 335 "justification": "The instrumented vs. non-instrumented comparison is the core experimental design — the scaffold (PCAS) IS the variable being tested, and the same models are used in both conditions." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "The paper evaluates a policy enforcement system, not model knowledge. Whether models have seen τ2-bench tasks is irrelevant — the measure is policy compliance under enforcement, not model capability." 343 }, 344 "feature_leakage_addressed": { 345 "applies": false, 346 "answer": false, 347 "justification": "Not applicable — the evaluation measures enforcement effectiveness, not model capability on benchmark tasks." 348 }, 349 "non_independence_addressed": { 350 "applies": false, 351 "answer": false, 352 "justification": "Not applicable — no train/test split or learning involved in PCAS." 
353 }, 354 "leakage_detection_method": { 355 "applies": false, 356 "answer": false, 357 "justification": "Not applicable — PCAS is not a learned system and leakage is not a meaningful threat to the evaluation." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "PCAS improves policy compliance from 48% to 93% on τ2-bench customer service tasks across frontier models", 364 "evidence": "Table 5 shows per-task pass rates: non-instrumented total 43/90 (48%), instrumented 84/90 (93%). Breakdown by model and task provided.", 365 "supported": "strong" 366 }, 367 { 368 "claim": "PCAS achieves zero policy violations in instrumented runs across all case studies", 369 "evidence": "§5.1-5.4: all failures in instrumented runs are attributed to reasoning errors (wrong payment method, missing passenger), not policy violations. cancel_reservation blocked 25 times, book_reservation blocked 5 times with no false positives (§5.3).", 370 "supported": "strong" 371 }, 372 { 373 "claim": "Prompt-embedded policies fail to prevent prompt injection attacks (100% ASR vs 0% with PCAS)", 374 "evidence": "Table 4a: non-instrumented baseline has 5/5 attack success rate; all three instrumented configurations achieve 0/5 ASR.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Runtime overhead of enforcement is modest (~20% latency, negligible cost increase)", 379 "evidence": "Tables 6a-6b show ~20% mean latency increase for τ2-bench. Token costs actually decrease 3.3% on average because the natural language policy is removed from the system prompt. Table 7b shows MALADE overhead ($0.090 vs $0.071).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Linear message histories are insufficient for policy enforcement in multi-agent systems; dependency graphs are required", 384 "evidence": "§3 provides formal arguments. §5.4 demonstrates with DependsSameAgent predicate that per-session authorization requires causal context across agent boundaries. 
Conceptually motivated but the necessity is argued, not empirically tested against a linear-history alternative.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "PCAS improves MALADE prediction accuracy from 14/15 to 15/15 while eliminating all 42 policy violations", 389 "evidence": "Table 7a shows accuracy and compliance results. The 42 unauthorized FDA API accesses in non-instrumented runs are all blocked in instrumented runs.", 390 "supported": "moderate" 391 } 392 ], 393 "red_flags": [ 394 { 395 "flag": "Very small sample sizes", 396 "detail": "All evaluations use only 5 trials per condition. With such small N, the reported pass rates have wide confidence intervals (e.g., 3/5 has a 95% Wilson CI of roughly 23-88%). No statistical tests or power analysis justify this sample size." 397 }, 398 { 399 "flag": "Cherry-picked task selection", 400 "detail": "Only 3 tasks per domain (6 total) were selected from τ2-bench, chosen because they 'elicit a specific category of policy violation.' The telecom domain was omitted because performance 'is already saturated.' This selection may overstate PCAS's impact by focusing on tasks where enforcement is most helpful." 401 }, 402 { 403 "flag": "No ablation of system components", 404 "detail": "PCAS has multiple components (dependency graph, reference monitor, policy engine with Datalog) but no ablation tests whether the full dependency graph is needed vs simpler approaches like linear trace checking." 405 }, 406 { 407 "flag": "Self-evaluation bias", 408 "detail": "Authors evaluate their own system and wrote both the Datalog policies and the evaluation. No independent evaluation or third-party policy authoring is included." 409 }, 410 { 411 "flag": "Langroid affiliation undisclosed as conflict", 412 "detail": "One author is affiliated with Langroid, which appears to be an LLM agent company. This potential conflict of interest is not discussed." 
413 } 414 ], 415 "cited_papers": [ 416 { 417 "title": "τ2-bench: Evaluating conversational agents in a dual-control environment", 418 "authors": ["Victor Barres", "Honghua Dong", "Soham Ray", "Xujie Si", "Karthik Narasimhan"], 419 "year": 2025, 420 "arxiv_id": "2506.07982", 421 "relevance": "Benchmark used for evaluating customer service agent policy compliance." 422 }, 423 { 424 "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents", 425 "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic"], 426 "year": 2024, 427 "relevance": "Dynamic benchmark for evaluating prompt injection attacks and defenses in LLM agent settings." 428 }, 429 { 430 "title": "Progent: Programmable privilege control for LLM agents", 431 "authors": ["Tianneng Shi", "Jingxuan He", "Zhun Wang"], 432 "year": 2025, 433 "arxiv_id": "2504.11703", 434 "relevance": "Domain-specific language for LLM agent privilege control; key baseline for runtime enforcement comparison." 435 }, 436 { 437 "title": "Defeating prompt injections by design", 438 "authors": ["Edoardo Debenedetti", "Ilia Shumailov"], 439 "year": 2025, 440 "arxiv_id": "2503.18813", 441 "relevance": "CaMeL architecture for separating data from instructions to defend against prompt injection." 442 }, 443 { 444 "title": "Securing AI agents with information-flow control", 445 "authors": ["Manuel Costa", "Boris Köpf", "Aashish Kolluri"], 446 "year": 2025, 447 "arxiv_id": "2505.23643", 448 "relevance": "FIDES system applying information-flow control for prompt injection defense; closest related work to PCAS." 449 }, 450 { 451 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 452 "authors": ["Kai Greshake", "Sahar Abdelnabi"], 453 "year": 2023, 454 "relevance": "Foundational work on indirect prompt injection attacks against LLM agents." 
455 }, 456 { 457 "title": "ShieldAgent: Shielding agents via verifiable safety policy reasoning", 458 "authors": ["Zhaorun Chen", "Mintong Kang", "Bo Li"], 459 "year": 2025, 460 "relevance": "Agent safety via logical rules and Markov logic networks; baseline comparison for policy enforcement." 461 }, 462 { 463 "title": "AgentSpec: Customizable runtime enforcement for safe and reliable LLM agents", 464 "authors": ["Haoyu Wang", "Christopher M Poskitt", "Jun Sun"], 465 "year": 2025, 466 "arxiv_id": "2503.18666", 467 "relevance": "Lightweight DSL for runtime enforcement of LLM agent safety constraints." 468 }, 469 { 470 "title": "How not to detect prompt injections with an LLM", 471 "authors": ["Sarthak Choudhary", "Divyam Anshumaan", "Nils Palumbo", "Somesh Jha"], 472 "year": 2025, 473 "relevance": "Evaluation showing LLM-based prompt injection detectors can be bypassed by adaptive attacks." 474 }, 475 { 476 "title": "The attacker moves second: Stronger adaptive attacks bypass defenses against LLM jailbreaks and prompt injections", 477 "authors": ["Milad Nasr", "Nicholas Carlini"], 478 "year": 2025, 479 "arxiv_id": "2510.09023", 480 "relevance": "Demonstrates that 12 published defenses against prompt injection can all be bypassed with >90% success." 481 }, 482 { 483 "title": "MALADE: Orchestration of LLM-powered agents with retrieval augmented generation for pharmacovigilance", 484 "authors": ["Jihye Choi", "Nils Palumbo", "Prasad Chalasani"], 485 "year": 2024, 486 "relevance": "Multi-agent pharmacovigilance system used as case study for PCAS evaluation." 487 }, 488 { 489 "title": "Systems security foundations for agentic computing", 490 "authors": ["Mihai Christodorescu", "Earlence Fernandes", "Somesh Jha"], 491 "year": 2025, 492 "arxiv_id": "2512.01295", 493 "relevance": "Foundational framework for security in agentic AI systems." 
494 }, 495 { 496 "title": "Why do multi-agent LLM systems fail?", 497 "authors": ["Mert Cemri", "Melissa Z Pan"], 498 "year": 2025, 499 "arxiv_id": "2503.13657", 500 "relevance": "Analysis of failure modes in multi-agent LLM systems, motivating the need for policy enforcement." 501 }, 502 { 503 "title": "AI agents under threat: A survey of key security challenges and future pathways", 504 "authors": ["Zehang Deng", "Yongjian Guo"], 505 "year": 2025, 506 "relevance": "Survey of security challenges in AI agent systems including unauthorized actions and data exfiltration." 507 } 508 ] 509 }