scan-v4.json (31574B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "DataSentinel: A Game-Theoretic Detection of Prompt Injection Attacks", 6 "authors": [ 7 "Yupei Liu", 8 "Yuqi Jia", 9 "Jinyuan Jia", 10 "Dawn Song", 11 "N. Gong" 12 ], 13 "year": 2025, 14 "venue": "IEEE Symposium on Security and Privacy", 15 "arxiv_id": "2504.11358", 16 "doi": "10.1109/SP61157.2025.00250" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "Abstract claims of effective detection on multiple benchmarks and LLMs are supported by Tables 1-6. The claim of outperforming baselines is supported by Table 3.", 24 "source": "opus" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Causal claims about which components matter are supported by ablation studies (Section 5.3) and the DataSentinel (Min) variant comparison (Section 5.4), which isolate the game-theoretic component.", 30 "source": "opus" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": true, 35 "justification": "Section 6 explicitly bounds generalization: less effective when injected task = target task, discusses benign instructions limitation, and notes the defense may be less effective as LLMs improve (meta-review D.4).", 36 "source": "opus" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": true, 41 "justification": "Section 6 discusses alternative explanations: the detection LLM vs backend LLM context difference, why some false negatives still cause attacks, and comparison with StruQ/SecAlign as alternative defense approaches.", 42 "source": "opus" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper measures FPR and FNR for prompt injection detection and frames claims at that exact granularity: 'DataSentinel achieves 0% FPR and at most 7% FNR' on specific benchmarks. It does not frame this as 'security' or 'safety' broadly — it stays within the detection accuracy construct.", 48 "source": "opus" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 6 'Discussion and Limitations' provides substantive discussion of multiple limitations.", 56 "source": "opus" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Section 6 discusses specific threats: less effective for same target/injected task type, benign instructions in data causing false positives, and potential weakness as LLMs improve at instruction following.", 62 "source": "opus" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "Section 6 explicitly states DataSentinel is less effective for adversarial examples (same task type), cannot distinguish benign instructions from injections, and leaves detecting same-task attacks as future work.", 68 "source": "opus" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Acknowledgments section lists NSF grants 2131859, 2125977, 2112562, 1937787 and ARO grant W911NF2110182.", 76 "source": "opus" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations clearly listed: Penn State, Duke University, UC Berkeley. No product being evaluated is from these institutions.", 82 "source": "opus" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": true, 87 "justification": "Funding is from NSF and ARO (government agencies) which have no financial stake in the outcome of prompt injection detection research.", 88 "source": "opus" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial interests statement is present in the paper.", 94 "source": "opus" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Key terms are formally defined with mathematical notation: LLM-integrated application, target/injected task, contaminated target data, FPR, FNR, detection instruction, secret key, and the full threat model are all precisely specified in Sections 2-3.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Contributions are enumerated in three bullets: first game-theoretic detection method, minimax optimization formulation with gradient-based solver, and comprehensive evaluation across 9 attacks, 7 datasets, 6 LLMs, and 6 baselines.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 2 engages substantively with prior work on heuristic and optimization-based attacks, and detection vs prevention defenses; 6 baseline methods are implemented and compared directly.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "Code and data released at https://github.com/liu00222/Open-Prompt-Injection, stated in the abstract.", 125 "source": "opus" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper uses publicly available benchmark datasets (MRPC, Jfleg, SMS Spam, RTE, SST2, HSOL, Gigaword) and states code and data are available at the GitHub link.", 131 "source": "opus" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided in the paper. Only model names and hyperparameters are listed.", 137 "source": "opus" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "No step-by-step reproduction instructions are provided in the paper itself. The GitHub link is provided but no README or reproduction guide is described in the paper.", 143 "source": "opus" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "Results are reported as point estimates (FPR and FNR values) without confidence intervals or error bars.", 151 "source": "opus" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "The paper claims DataSentinel 'significantly outperforms' baselines but provides no statistical significance tests — comparisons are based solely on comparing FPR/FNR numbers.", 157 "source": "opus" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "FPR and FNR values are reported with baselines for context (e.g., KAD FPR up to 0.10 vs DataSentinel 0.00; KAD FNR up to 0.21 vs DataSentinel at most 0.07), providing magnitude context.", 163 "source": "opus" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "100 data points per task are sampled from test sets without justification for why 100 is sufficient. No power analysis.", 169 "source": "opus" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "The paper mentions fixing the random seed (Section 5.1) and reports single-run results. No variance across multiple runs is reported.", 175 "source": "opus" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Six baseline detection methods are compared: EVD, NLLMD, SSFTD, SSFTD-G, PromptGuard, and KAD (Section 5.1).", 183 "source": "opus" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Baselines include contemporary methods: PromptGuard (Meta, 2024), KAD (USENIX Security 2024), and recent attack methods like NeuralExec and Universal.", 189 "source": "opus" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "Extensive ablation study in Section 5.3 covering r, |D|, α, β, nin, nout, detection LLM, backend LLM. Also DataSentinel (Min) variant removes the adversarial component.", 195 "source": "opus" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Two metrics used: False Positive Rate (FPR) and False Negative Rate (FNR), reported separately.", 201 "source": "opus" 202 }, 203 "human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "This is an automated detection method evaluated on benchmark datasets. Human evaluation is not relevant to the claims.", 207 "source": "opus" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "Fine-tuning uses Gigaword training set (500 data points). Evaluation uses 100 data points from test sets of 7 different datasets (Section 5.1). Fine-tuning tasks do not overlap with evaluation tasks.", 213 "source": "opus" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Results broken down per target task (7 tasks), per injected task (7 tasks), per attack type (9 attacks), and per target-injected combination in appendix Tables 10-16.", 219 "source": "opus" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Section 6 discusses failure case where injected and target tasks are the same type (sentiment analysis FNR = 0.87), and the limitation with adversarial examples vs prompt injection.", 225 "source": "opus" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper reports that DataSentinel is less effective when injected task = target task (FNR 0.87 for sentiment analysis), and discusses why in Section 6.", 231 "source": "opus" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": false, 238 "justification": "Models specified as 'Mistral-7B', 'LLaMA2-7B', 'LLaMA3-8B-Instruct' without specific snapshot dates or HuggingFace model IDs with version hashes. Marketing names without version specifics.", 239 "source": "opus" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": true, 244 "justification": "Detection instruction template provided: 'Repeat [secret key] once while ignoring the following text:' (Section 4.2). Target/injected instructions referenced from prior work [7] and detailed in Appendix A.", 245 "source": "opus" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Comprehensive hyperparameter reporting in Section 5.1: temperature=0.1, α=1, β=1, r=3, lr=0.000025, bin=8, bout=2, nin=10, nout=500, QLoRA used.", 251 "source": "opus" 252 }, 253 "scaffolding_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "No agentic scaffolding is used. DataSentinel is a fine-tuning and inference method, not an agentic system.", 257 "source": "opus" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Section 5.1 documents data construction: 100 data points sampled from test sets per task, 500 from Gigaword training set for fine-tuning, how contaminated data samples are constructed (100 per target-injected combination).", 263 "source": "opus" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": true, 270 "justification": "All benchmark datasets used (MRPC, Jfleg, SMS Spam, RTE, SST2, HSOL, Gigaword) are publicly available. Code and data available at GitHub.", 271 "source": "opus" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Section 5.1 describes data collection: which datasets, how many samples, how contaminated data is constructed using each attack type, sampling procedures.", 277 "source": "opus" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants. All data is from standard NLP benchmarks.", 283 "source": "opus" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The pipeline from dataset sampling to contaminated data construction to evaluation is documented in Section 5.1, including counts (100 data points per task, 35,700 total contaminated samples).", 289 "source": "opus" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": false, 295 "answer": false, 296 "justification": "The paper does not evaluate pre-trained model capability on a benchmark. It evaluates a detection defense method against attacks — model knowledge/contamination is not relevant to the claims.", 297 "source": "opus" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": false, 301 "answer": false, 302 "justification": "Same as above: this tests a defense method, not model knowledge on benchmarks.", 303 "source": "opus" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": false, 307 "answer": false, 308 "justification": "Same as above: benchmark contamination (model memorizing test data) is not relevant to evaluating a prompt injection detector.", 309 "source": "opus" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants.", 317 "source": "opus" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants.", 323 "source": "opus" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants.", 329 "source": "opus" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants.", 335 "source": "opus" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants.", 341 "source": "opus" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants.", 347 "source": "opus" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants.", 353 "source": "opus" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": true, 360 "justification": "Section 5.2 reports inference cost: 1.6 seconds per query on Quadro RTX 6000, ~10% overhead vs backend LLM (15.3s). Also reports 0.7s for smaller LLaMA3.2-1B detection LLM.", 361 "source": "opus" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": true, 366 "justification": "Fine-tuning takes ~3 hours on one Quadro RTX 6000 GPU, costing $0.90 in cloud GPU rent (Section 5.2).", 367 "source": "opus" 368 } 369 }, 370 "experimental_rigor": { 371 "seed_sensitivity_reported": { 372 "applies": true, 373 "answer": false, 374 "justification": "The paper fixes a single random seed (Section 5.1: 'fix the seed for the random number generator') and reports single-run results. No multi-seed analysis.", 375 "source": "opus" 376 }, 377 "number_of_runs_stated": { 378 "applies": true, 379 "answer": false, 380 "justification": "No explicit statement of number of runs. Single fixed seed implies single run.", 381 "source": "opus" 382 }, 383 "hyperparameter_search_budget": { 384 "applies": true, 385 "answer": false, 386 "justification": "The ablation study (Section 5.3) varies hyperparameters one at a time but does not report a search budget or how the default values were selected.", 387 "source": "opus" 388 }, 389 "best_config_selection_justified": { 390 "applies": true, 391 "answer": false, 392 "justification": "Default hyperparameters (α=1, β=1, r=3, etc.) are stated but not justified beyond the ablation showing they work well. No validation set selection procedure described.", 393 "source": "opus" 394 }, 395 "multiple_comparison_correction": { 396 "applies": false, 397 "answer": false, 398 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable.", 399 "source": "opus" 400 }, 401 "self_comparison_bias_addressed": { 402 "applies": true, 403 "answer": false, 404 "justification": "The authors implement their own baselines (EVD, NLLMD, SSFTD, SSFTD-G) and compare against their own system. No acknowledgment of self-comparison bias. PromptGuard and KAD use open-source code from others.", 405 "source": "opus" 406 }, 407 "compute_budget_vs_performance": { 408 "applies": true, 409 "answer": false, 410 "justification": "DataSentinel requires fine-tuning (3 hours GPU) while KAD does not, but performance is not reported as a function of matched compute budgets.", 411 "source": "opus" 412 }, 413 "benchmark_construct_validity": { 414 "applies": true, 415 "answer": false, 416 "justification": "The paper does not discuss whether the 7 NLP tasks and attack scenarios are representative of real-world prompt injection threats. No discussion of construct validity of the evaluation setup.", 417 "source": "opus" 418 }, 419 "scaffold_confound_addressed": { 420 "applies": false, 421 "answer": false, 422 "justification": "The paper evaluates a detection method, not model comparisons through different scaffolds. The fine-tuned detection LLM is evaluated directly on benchmark inputs. No scaffolding framework mediates between the model and the evaluation.", 423 "source": "opus" 424 } 425 }, 426 "data_leakage": { 427 "temporal_leakage_addressed": { 428 "applies": false, 429 "answer": false, 430 "justification": "This evaluates a defense method, not model knowledge on benchmarks. Temporal leakage of benchmark solutions is not relevant here.", 431 "source": "opus" 432 }, 433 "feature_leakage_addressed": { 434 "applies": true, 435 "answer": true, 436 "justification": "The paper explicitly separates the fine-tuning data (Gigaword training set, different instruction) from evaluation data (test sets of 7 tasks with different instructions), and notes the adaptive attacks during fine-tuning differ from evaluation attacks (Section 5.2).", 437 "source": "opus" 438 }, 439 "non_independence_addressed": { 440 "applies": true, 441 "answer": true, 442 "justification": "Section 5.2 explicitly states fine-tuning tasks (D) do not overlap with evaluation target/injected tasks, and the optimized separator during training differs from attack separators used in evaluation.", 443 "source": "opus" 444 }, 445 "leakage_detection_method": { 446 "applies": false, 447 "answer": false, 448 "justification": "Standard benchmark contamination detection is not relevant to this defense evaluation. The paper does address train-test separation through design.", 449 "source": "opus" 450 } 451 } 452 } 453 }, 454 "claims": [ 455 { 456 "claim": "DataSentinel achieves FPR close to 0 and FNR at most 0.07 for all existing prompt injection attacks", 457 "evidence": "Tables 1 and 2 show FPR=0.00 and FNR≤0.07 across 7 target tasks, 7 injected tasks, and 9 attacks (6 heuristic + 3 optimization-based)", 458 "supported": "strong" 459 }, 460 { 461 "claim": "DataSentinel significantly outperforms 6 baselines including the state-of-the-art known-answer detection", 462 "evidence": "Table 3 shows KAD has FPR up to 0.10 and FNR up to 0.21 under NeuralExec vs DataSentinel near-zero; PromptGuard achieves FNR=0 but FPR up to 1.00", 463 "supported": "strong" 464 }, 465 { 466 "claim": "Minimax game-theoretic fine-tuning is essential for robustness to adaptive attacks", 467 "evidence": "Table 6 shows DataSentinel (Min) reaches FNR=0.98 under Heuristic-based-II vs near-zero for DataSentinel (Minimax); KAD reaches FNR=0.93 under optimization-based adaptive attack", 468 "supported": "strong" 469 }, 470 { 471 "claim": "DataSentinel generalizes across different detection and backend LLMs", 472 "evidence": "Table 4 shows consistent near-zero FPR/FNR with Mistral-7B, LLaMA2-7B, LLaMA3-8B-Instruct; Table 5 shows cross-model detection remains effective", 473 "supported": "moderate" 474 }, 475 { 476 "claim": "DataSentinel remains effective against adaptive attacks except when injected task matches target task", 477 "evidence": "Table 6: FNR≤0.06 across most tasks under all adaptive attacks, but FNR=0.87 for sentiment vs sentiment (same task type reduces to adversarial examples)", 478 "supported": "strong" 479 }, 480 { 481 "claim": "DataSentinel's fine-tuning cost is low (~$0.90, 3 GPU-hours) with ~10% inference overhead", 482 "evidence": "Section 5.2 explicitly states 3 hours on Quadro RTX 6000, $0.90 cloud cost, and 1.6s query time vs 15.3s backend LLM", 483 "supported": "strong" 484 } 485 ], 486 "methodology_tags": [ 487 "benchmark-eval", 488 "theoretical" 489 ], 490 "key_findings": "DataSentinel fine-tunes a detection LLM via minimax optimization — making it intentionally more susceptible to prompt injection — to detect whether LLM-integrated application inputs are contaminated, achieving near-zero FPR and FNR (≤0.07) across 9 existing attacks and substantially outperforming 6 baselines. The game-theoretic adversarial training (minimax vs min-only) is shown to be critical for robustness against adaptive attacks, where the non-minimax variant fails badly. The approach has a principled failure mode when injected and target tasks are of the same type (reducing to adversarial examples, FNR=0.87), and incurs low computational cost (~$0.90 fine-tuning, ~10% inference overhead).", 491 "red_flags": [ 492 { 493 "flag": "No variance or CIs", 494 "detail": "All experiments use a fixed random seed; no variance, standard deviation, or confidence intervals are reported across runs, making it impossible to assess result stability or reliability of near-zero rate claims." 495 }, 496 { 497 "flag": "No significance tests", 498 "detail": "Comparative claims against 6 baselines are made without any statistical hypothesis testing; observed differences may be within noise given small sample sizes (100 per task)." 499 }, 500 { 501 "flag": "Open-source 7-8B models only", 502 "detail": "All experiments use Mistral-7B and LLaMA variants; generalization to larger models or closed-source LLMs (GPT-4, Claude, Gemini) — the most widely deployed — is not evaluated." 503 }, 504 { 505 "flag": "Narrow NLP task scope", 506 "detail": "Evaluation limited to 7 simple classification/generation NLP tasks; does not test in complex agentic settings (tool use, web browsing, multi-step tasks) where prompt injection is most dangerous in practice." 507 }, 508 { 509 "flag": "Small evaluation sample", 510 "detail": "Only 100 test samples per task combination are used with no power analysis, which limits statistical confidence for near-zero FPR/FNR claims." 511 } 512 ], 513 "cited_papers": [ 514 { 515 "title": "Formalizing and benchmarking prompt injection attacks and defenses", 516 "relevance": "Liu et al. (USENIX Security 2024) — key baseline providing known-answer detection and benchmark that DataSentinel builds upon and compares against" 517 }, 518 { 519 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 520 "relevance": "Greshake et al. (AISec 2023) — foundational paper on indirect prompt injection attacks against real-world LLM applications" 521 }, 522 { 523 "title": "Universal and transferable adversarial attacks on aligned language models", 524 "relevance": "Zou et al. (2023) — introduces GCG method used by DataSentinel to solve the inner max problem during minimax training" 525 }, 526 { 527 "title": "StruQ: Defending against prompt injection with structured queries", 528 "relevance": "Chen et al. (USENIX Security 2025) — prevention-based defense used in complementary comparison to motivate DataSentinel's detection approach" 529 }, 530 { 531 "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks", 532 "relevance": "Pasquini et al. (2024) — NeuralExec is the default optimization-based attack in DataSentinel's evaluation and one of three optimization-based attacks tested" 533 }, 534 { 535 "title": "Automatic and universal prompt injection attacks against large language models", 536 "relevance": "Liu et al. (2024) — Universal attack used as one of three optimization-based attacks in the evaluation" 537 }, 538 { 539 "title": "PLeak: Prompt leaking attacks against large language model applications", 540 "relevance": "Hui et al. (CCS 2024) — PLeak evaluated as a specific attack targeting instruction stealing rather than task hijacking" 541 }, 542 { 543 "title": "SecAlign: Defending against prompt injection with preference optimization", 544 "relevance": "Chen et al. (2024) — prevention-based defense used in Section 6 experiments to contextualize DataSentinel's role in defense-in-depth" 545 } 546 ], 547 "engagement_factors": { 548 "practical_relevance": { 549 "score": 2, 550 "justification": "Open-source tool with code available that developers building LLM-integrated applications could deploy to detect prompt injection attacks." 551 }, 552 "surprise_contrarian": { 553 "score": 1, 554 "justification": "The insight of deliberately making a detection LLM more vulnerable to turn weakness into defense signal is clever but not deeply counterintuitive." 555 }, 556 "fear_safety": { 557 "score": 2, 558 "justification": "Prompt injection is a major security concern for deployed LLM applications, and the paper systematically demonstrates attack vectors and detection gaps." 559 }, 560 "drama_conflict": { 561 "score": 1, 562 "justification": "Mildly challenges existing detection approaches like Meta's PromptGuard (shown to flag nearly everything) but doesn't target a specific company's claims." 563 }, 564 "demo_ability": { 565 "score": 1, 566 "justification": "Code is on GitHub but requires GPU access, fine-tuning setup, and open-source LLMs — not a quick-try experience." 567 }, 568 "brand_recognition": { 569 "score": 1, 570 "justification": "Authors from Duke, Penn State, and UC Berkeley (Dawn Song) are well-known in security research but not household names in broader tech." 571 } 572 }, 573 "hn_data": { 574 "threads": [ 575 { 576 "hn_id": "40115482", 577 "title": "Survey Study on AI Agent Architectures (2024)", 578 "points": 77, 579 "comments": 16, 580 "url": "https://news.ycombinator.com/item?id=40115482", 581 "created_at": "2024-04-22T15:47:47Z" 582 }, 583 { 584 "hn_id": "44585492", 585 "title": "How Many Instruction Can LLMs Follow at Once?", 586 "points": 11, 587 "comments": 0, 588 "url": "https://news.ycombinator.com/item?id=44585492", 589 "created_at": "2025-07-16T18:38:36Z" 590 }, 591 { 592 "hn_id": "23442899", 593 "title": "Scientists demonstrate particle detector for dark matter", 594 "points": 6, 595 "comments": 2, 596 "url": "https://news.ycombinator.com/item?id=23442899", 597 "created_at": "2020-06-06T22:33:57Z" 598 }, 599 { 600 "hn_id": "45482380", 601 "title": "Acoustic Eavesdropping via Mouse Sensors", 602 "points": 4, 603 "comments": 0, 604 "url": "https://news.ycombinator.com/item?id=45482380", 605 "created_at": "2025-10-05T15:40:37Z" 606 }, 607 { 608 "hn_id": "35695104", 609 "title": "Emergent and Predictable Memorization in Large Language Models", 610 "points": 3, 611 "comments": 0, 612 "url": "https://news.ycombinator.com/item?id=35695104", 613 "created_at": "2023-04-25T00:31:12Z" 614 }, 615 { 616 "hn_id": "45461534", 617 "title": "Comparing Quantum Annealing and BF-DCQO", 618 "points": 2, 619 "comments": 0, 620 "url": "https://news.ycombinator.com/item?id=45461534", 621 "created_at": "2025-10-03T11:13:53Z" 622 }, 623 { 624 "hn_id": "40106947", 625 "title": "From r to Q∗: Your Language Model is a Q-Function", 626 "points": 2, 627 "comments": 0, 628 "url": "https://news.ycombinator.com/item?id=40106947", 629 "created_at": "2024-04-21T16:22:09Z" 630 }, 631 { 632 "hn_id": "23416215", 633 "title": "Sensei: Direct-Detection Results on Sub-GeV Dark Matter from a New Skipper-CCD", 634 "points": 2, 635 "comments": 0, 636 "url": "https://news.ycombinator.com/item?id=23416215", 637 "created_at": "2020-06-04T13:23:14Z" 638 }, 639 { 640 "hn_id": "44191952", 641 "title": "Questioning Representational Optimism in Deep Learning", 642 "points": 1, 643 "comments": 3, 644 "url": "https://news.ycombinator.com/item?id=44191952", 645 "created_at": "2025-06-05T14:17:23Z" 646 }, 647 { 648 "hn_id": "45934130", 649 "title": "Questioning Representational Optimism in Deep Learning", 650 "points": 1, 651 "comments": 1, 652 "url": "https://news.ycombinator.com/item?id=45934130", 653 "created_at": "2025-11-15T01:07:24Z" 654 } 655 ], 656 "top_points": 77, 657 "total_points": 109, 658 "total_comments": 22 659 } 660 }