scan-v5.json (31901B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Malice in Agentland: Down the Rabbit Hole of Backdoors in the AI Supply Chain", 6 "authors": [ 7 "Léo Boisvert", 8 "Abhay Puri", 9 "Chandra Kiran Reddy Evuru", 10 "Nicolas Chapados", 11 "Quentin Cappart", 12 "Alexandre Lacoste", 13 "Krishnamurthy (DJ) Dvijotham", 14 "Alexandre Drouin" 15 ], 16 "year": 2025, 17 "venue": "arXiv.org", 18 "arxiv_id": "2510.05159", 19 "doi": "10.48550/arXiv.2510.05159" 20 }, 21 "checklist": { 22 "claims_and_evidence": { 23 "abstract_claims_supported": { 24 "applies": true, 25 "answer": true, 26 "justification": "The abstract's key quantitative claims (2% poisoning → 80%+ ASR, defenses failing across all three threat models) are directly supported by Tables 2, 3, and the Watch the Weights analysis in Section 5.3.", 27 "source": "haiku" 28 }, 29 "causal_claims_justified": { 30 "applies": true, 31 "answer": true, 32 "justification": "Controlled experiments vary poison rate as the single manipulated variable while holding other factors constant, and they include clean baseline conditions, which is adequate for causal inference about data poisoning causing backdoor implantation.", 33 "source": "haiku" 34 }, 35 "generalization_bounded": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper claims the vulnerability holds across 'the agentic AI supply chain' broadly, but experiments are limited to two benchmarks (τ-bench retail/airline, WebArena-Lite) and three model sizes; broader generalization to other agent architectures or tasks is asserted but not demonstrated.", 39 "source": "haiku" 40 }, 41 "alternative_explanations_discussed": { 42 "applies": true, 43 "answer": false, 44 "justification": "The paper does not consider alternative explanations for defense failures beyond their stated reasoning; for example, they do not explore whether better-calibrated guardrail models or different detection thresholds could work, nor whether the benchmarks' specific structure makes attacks easier than realistic deployments.", 45 "source": "haiku" 46 }, 47 "proxy_outcome_distinction": { 48 "applies": true, 49 "answer": true, 50 "justification": "TSR (task success rate) and ASR (attack success rate) are clearly defined and directly measure the constructs claimed—benign utility and backdoor activation respectively—with no conflation between proxy and target.", 51 "source": "haiku" 52 } 53 }, 54 "limitations_and_scope": { 55 "limitations_section_present": { 56 "applies": true, 57 "answer": false, 58 "justification": "There is no dedicated limitations or threats-to-validity section; the paper has a Discussion/Conclusion section and future work directions but these do not constitute a formal limitations analysis.", 59 "source": "haiku" 60 }, 61 "threats_to_validity_specific": { 62 "applies": true, 63 "answer": false, 64 "justification": "No specific threats to validity are discussed; the paper does not acknowledge that results on two synthetic benchmarks with small test sets (25–165 tasks) may not generalize, or that the controlled poisoning scenarios may not reflect realistic attacker constraints.", 65 "source": "haiku" 66 }, 67 "scope_boundaries_stated": { 68 "applies": true, 69 "answer": false, 70 "justification": "The paper does not explicitly state what results do NOT show; for instance, it does not clarify that results are limited to SFT fine-tuning pipelines, that RLHF-trained models are untested, or that real enterprise deployments may have additional safeguards not evaluated here.", 71 "source": "haiku" 72 } 73 }, 74 "conflicts_of_interest": { 75 "funding_disclosed": { 76 "applies": true, 77 "answer": false, 78 "justification": "The acknowledgments thank ServiceNow colleagues for compute resources but there is no formal funding disclosure statement; the primary affiliation (ServiceNow Research) suggests institutional support without explicit declaration.", 79 "source": "haiku" 80 }, 81 "affiliations_disclosed": { 82 "applies": true, 83 "answer": true, 84 "justification": "Author affiliations are fully disclosed in the paper header: ServiceNow Research, Mila - Québec AI Institute, Polytechnique Montréal, and Université Laval.", 85 "source": "haiku" 86 }, 87 "funder_independent_of_outcome": { 88 "applies": true, 89 "answer": false, 90 "justification": "ServiceNow is a major AI agent platform provider with a business interest in AI agent security awareness; six of eight authors are ServiceNow employees, creating a non-independent relationship between primary funder/employer and the domain being studied.", 91 "source": "haiku" 92 }, 93 "financial_interests_declared": { 94 "applies": true, 95 "answer": false, 96 "justification": "There is no competing interests statement or financial interests declaration in the paper.", 97 "source": "haiku" 98 } 99 }, 100 "scope_and_framing": { 101 "key_terms_defined": { 102 "applies": true, 103 "answer": true, 104 "justification": "Key terms are precisely defined: 'agent' is formalized as a policy π mapping observations to action distributions, 'backdoor' and 'trigger' are formally defined mathematically, and ASR/TSR metrics are explicitly defined in Section 4.", 105 "source": "haiku" 106 }, 107 "intended_contribution_clear": { 108 "applies": true, 109 "answer": true, 110 "justification": "The paper explicitly lists four numbered contributions in Section 1: a threat model taxonomy, empirical demonstration across two benchmarks, evaluation of defenses, and a call for new strategies.", 111 "source": "haiku" 112 }, 113 "engagement_with_prior_work": { 114 "applies": true, 115 "answer": true, 116 "justification": "Section 2 provides substantive engagement with three streams of prior work (inference-time attacks, non-agentic data poisoning, backdooring attacks), explicitly differentiating this work as the first to situate backdoor attacks within the agentic supply chain context.", 117 "source": "haiku" 118 } 119 } 120 }, 121 "type_checklist": { 122 "empirical": { 123 "artifacts": { 124 "code_released": { 125 "applies": true, 126 "answer": false, 127 "justification": "No code release is mentioned in the paper; references to DoomArena are to a separate arxiv preprint, and the paper itself provides no repository link or code release.", 128 "source": "haiku" 129 }, 130 "data_released": { 131 "applies": true, 132 "answer": true, 133 "justification": "Evaluation benchmarks (τ-bench and WebArena-Lite) are publicly available, and the NNetNav-WA fine-tuning dataset used is from a public prior work; however, the poisoned training datasets generated by the authors are not released.", 134 "source": "haiku" 135 }, 136 "environment_specified": { 137 "applies": true, 138 "answer": true, 139 "justification": "Section C provides detailed environment specifications: 8×A100 80GB GPUs, DeepSpeed ZeRO-2/3, Flash Attention 2, LLaMA-Factory framework, batch sizes, learning rates, context lengths, and precision settings.", 140 "source": "haiku" 141 }, 142 "reproduction_instructions": { 143 "applies": true, 144 "answer": false, 145 "justification": "Hyperparameters are documented in Section C and attack examples in appendices, but there are no step-by-step reproduction instructions; the code pipeline for generating poisoned data, running fine-tuning, and evaluating attacks is not provided in executable form.", 146 "source": "haiku" 147 } 148 }, 149 "statistical_methodology": { 150 "confidence_intervals_or_error_bars": { 151 "applies": true, 152 "answer": true, 153 "justification": "Standard deviations are reported for all main results (TSR ± STD, ASR ± STD) across all tables, with 3 trials for τ-bench and 2 trials for WebArena.", 154 "source": "haiku" 155 }, 156 "significance_tests": { 157 "applies": true, 158 "answer": false, 159 "justification": "No formal statistical significance tests are used; the paper relies on descriptive statistics and large effect sizes (ASR near 0% vs 100%) rather than formal hypothesis testing for comparative claims.", 160 "source": "haiku" 161 }, 162 "effect_sizes_reported": { 163 "applies": true, 164 "answer": true, 165 "justification": "Absolute performance values with baselines are reported throughout (e.g., ASR jumping from 0% to 100%, TSR increasing from 22.61% zero-shot to 39-41% fine-tuned), making effect sizes implicit and interpretable.", 166 "source": "haiku" 167 }, 168 "sample_size_justified": { 169 "applies": true, 170 "answer": false, 171 "justification": "The choice of 3 trials for τ-bench and 2 for WebArena is justified only by 'higher computational cost,' with no power analysis or justification for why 2-3 trials are sufficient given the observed variance.", 172 "source": "haiku" 173 }, 174 "variance_reported": { 175 "applies": true, 176 "answer": true, 177 "justification": "Standard deviations are consistently reported for all TSR and ASR measurements across all experimental conditions in Tables 2-12.", 178 "source": "haiku" 179 } 180 }, 181 "evaluation_design": { 182 "baselines_included": { 183 "applies": true, 184 "answer": true, 185 "justification": "Zero-shot baselines and clean fine-tuning baselines (0% poison) are included for all experiments, providing clear reference points for measuring attack effectiveness.", 186 "source": "haiku" 187 }, 188 "baselines_contemporary": { 189 "applies": true, 190 "answer": true, 191 "justification": "Defense baselines include contemporary state-of-the-art guardrail models (Llama-Firewall from 2025, Granite Guardian 3.3-8B from 2025) and Watch the Weights (2025), all current at the time of publication.", 192 "source": "haiku" 193 }, 194 "ablation_study": { 195 "applies": true, 196 "answer": true, 197 "justification": "Figure 3 systematically varies poison rate ρ across multiple values (5%, 25%, 50%) to isolate the effect of poisoning intensity, and TM3 varies the number of clean fine-tuning steps to study backdoor persistence.", 198 "source": "haiku" 199 }, 200 "multiple_metrics": { 201 "applies": true, 202 "answer": true, 203 "justification": "Two complementary metrics are used throughout: TSR (task success rate measuring benign utility) and ASR (attack success rate measuring backdoor effectiveness), as well as ASR- (false activation rate).", 204 "source": "haiku" 205 }, 206 "human_evaluation": { 207 "applies": false, 208 "answer": false, 209 "justification": "Human evaluation is not applicable; the study evaluates automated agent behavior and attack/defense metrics that can be assessed computationally through benchmark task success and backdoor activation.", 210 "source": "haiku" 211 }, 212 "held_out_test_set": { 213 "applies": true, 214 "answer": true, 215 "justification": "All evaluations use held-out test sets distinct from training data: τ-bench retail test domain (115 tasks), τ-bench airline domain (25 tasks for TM3), and WebArena-Lite (165 tasks).", 216 "source": "haiku" 217 }, 218 "per_category_breakdown": { 219 "applies": true, 220 "answer": true, 221 "justification": "Results are broken down by threat model (TM1/2/3), benchmark (τ-bench vs WebArena), model size (3B vs 7B), and defense type, providing granular analysis across multiple dimensions.", 222 "source": "haiku" 223 }, 224 "failure_cases_discussed": { 225 "applies": true, 226 "answer": true, 227 "justification": "The Qwen 50% poison rate ASR decline in WebArena (Table 5 appendix) is explicitly analyzed as a failure case attributed to the model selection criterion, and Watch the Weights' 97-100% FPR is discussed as a failure mode.", 228 "source": "haiku" 229 }, 230 "negative_results_reported": { 231 "applies": true, 232 "answer": true, 233 "justification": "Defense failures are extensively reported as negative results (all three tested defenses failing), and the cross-domain backdoor transfer degradation (Table 8) shows partial failure of attack at low poison rates.", 234 "source": "haiku" 235 } 236 }, 237 "setup_transparency": { 238 "model_versions_specified": { 239 "applies": true, 240 "answer": true, 241 "justification": "Exact model versions are specified throughout: Qwen-2.5-3B-Instruct, Qwen-2.5-7B-Instruct, Llama-3.1-8B-Instruct, Qwen-2.5-72B-Instruct (teacher), GPT-4o (simulated user), and GPT-5 variants for judge experiments.", 242 "source": "haiku" 243 }, 244 "prompts_provided": { 245 "applies": true, 246 "answer": true, 247 "justification": "Complete prompt examples are provided: web agent system prompt (Section B.1), LLM-as-judge prompts (Section D), and the full prompt injection strings used for TM2 (Sections E.1.2 and E.2.2).", 248 "source": "haiku" 249 }, 250 "hyperparameters_reported": { 251 "applies": true, 252 "answer": true, 253 "justification": "Section C provides comprehensive hyperparameters: learning rates (1e-5, 1e-6), batch sizes, context lengths (16,384 and 20,000 tokens), epochs, warmup ratios, gradient norms, and LoRA rank settings.", 254 "source": "haiku" 255 }, 256 "scaffolding_described": { 257 "applies": true, 258 "answer": true, 259 "justification": "Agent scaffolding is described in detail: the NNetNavBrowserGymAgent with enhanced API call action set for WebArena (Section B.1), and the tool-calling agent architecture with GPT-4o as simulated user for τ-bench.", 260 "source": "haiku" 261 }, 262 "data_preprocessing_documented": { 263 "applies": true, 264 "answer": true, 265 "justification": "Data preprocessing is documented: how poisoned traces are created (trigger injection via hidden div elements, malicious tool call insertion), how clean traces are filtered (reward=1 only), and how training/validation splits are made (90/10).", 266 "source": "haiku" 267 } 268 }, 269 "data_integrity": { 270 "raw_data_available": { 271 "applies": true, 272 "answer": false, 273 "justification": "The poisoned training datasets generated by the authors are not released; only the public benchmark sources are available, and the specific poisoned traces used in experiments cannot be independently verified.", 274 "source": "haiku" 275 }, 276 "data_collection_described": { 277 "applies": true, 278 "answer": true, 279 "justification": "Data collection procedures are described in detail: Qwen-2.5-72B as teacher with GPT-4o as simulated user, 10 independent trials retaining successful trajectories (reward=1), yielding 4,000 samples split 90/10.", 280 "source": "haiku" 281 }, 282 "recruitment_methods_described": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants; all data is generated synthetically using LLMs as teacher agents and simulated users in automated benchmark environments.", 286 "source": "haiku" 287 }, 288 "data_pipeline_documented": { 289 "applies": true, 290 "answer": true, 291 "justification": "The full pipeline is documented: base model selection → data collection (teacher model in environment) → poisoning injection → fine-tuning (SFT with documented hyperparameters) → evaluation (TSR/ASR on held-out test sets).", 292 "source": "haiku" 293 } 294 }, 295 "contamination": { 296 "training_cutoff_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "The training data cutoffs for Qwen-2.5 and Llama-3.1 base models are not stated; the paper does not address whether the base models' pre-training data included WebArena or τ-bench task descriptions.", 300 "source": "haiku" 301 }, 302 "train_test_overlap_discussed": { 303 "applies": true, 304 "answer": false, 305 "justification": "Potential overlap between base model pre-training data and the τ-bench/WebArena test tasks is not discussed; this could confound the zero-shot baseline performance and the interpretation of fine-tuning improvements.", 306 "source": "haiku" 307 }, 308 "benchmark_contamination_addressed": { 309 "applies": true, 310 "answer": false, 311 "justification": "The paper does not address whether WebArena or τ-bench examples were available before the Qwen-2.5 or Llama-3.1 training cutoffs, which could affect the validity of the zero-shot baselines used as attack comparisons.", 312 "source": "haiku" 313 } 314 }, 315 "human_studies": { 316 "pre_registered": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human participants in this study; all experiments use automated AI agents and LLM-generated synthetic interactions.", 320 "source": "haiku" 321 }, 322 "irb_or_ethics_approval": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants; IRB/ethics approval is not applicable. The paper has an ethics statement discussing responsible disclosure but no IRB process.", 326 "source": "haiku" 327 }, 328 "demographics_reported": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants.", 332 "source": "haiku" 333 }, 334 "inclusion_exclusion_criteria": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human participants.", 338 "source": "haiku" 339 }, 340 "randomization_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human participants; multiple trials were run but randomization procedures for trial design are not described.", 344 "source": "haiku" 345 }, 346 "blinding_described": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human participants.", 350 "source": "haiku" 351 }, 352 "attrition_reported": { 353 "applies": false, 354 "answer": false, 355 "justification": "No human participants.", 356 "source": "haiku" 357 } 358 }, 359 "cost_and_practicality": { 360 "inference_cost_reported": { 361 "applies": true, 362 "answer": false, 363 "justification": "Inference costs (API calls to GPT-4o as simulated user, GPT-5 as judge) are not reported; only training compute is described.", 364 "source": "haiku" 365 }, 366 "compute_budget_stated": { 367 "applies": true, 368 "answer": true, 369 "justification": "Training compute is reported: 8×A100 80GB GPUs for 5-6 hours (τ-bench TM1/2) and 15-20 hours (WebArena), giving a concrete sense of the computational investment required.", 370 "source": "haiku" 371 } 372 } 373 } 374 }, 375 "claims": [ 376 { 377 "claim": "Poisoning as few as 2% of fine-tuning traces embeds a backdoor with over 80% attack success rate, while preserving or improving benign task performance", 378 "evidence": "Table 3: WebArena TM2 at 2.3% poison rate achieves 91.65% ASR with TSR of 16.27% (vs 0.6% baseline); τ-bench TM2 at 5% achieves 100% ASR with TSR 43.77% (vs 22.61% baseline)", 379 "supported": "strong" 380 }, 381 { 382 "claim": "State-of-the-art guardrail models (Llama-Firewall, Granite Guardian) fail to detect backdoored behavior at both data-screening and evaluation-time stages", 383 "evidence": "Tables 11-12: TP rates for data screening are 0-7% with FP rates at similar levels; Tables 2-3 show ASR remains near 100% with guardrails active at evaluation time", 384 "supported": "strong" 385 }, 386 { 387 "claim": "Backdoors survive extensive clean fine-tuning (TM3): ASR remains above 90% in τ-bench and near 100% in WebArena after thousands of clean training steps", 388 "evidence": "Figure 4 shows ASR persistence for Qwen-2.5-3B and Llama-3.1-8B across clean fine-tuning checkpoints; Table 10 confirms ASR=100% after 1,000 clean LoRA samples", 389 "supported": "strong" 390 }, 391 { 392 "claim": "The Watch the Weights weight-based defense achieves 97-100% false positive rate, rendering it practically unusable in realistic deployment scenarios", 393 "evidence": "Section 5.3 explicitly reports 97% FPR on τ-bench and 100% FPR on WebArena, attributed to domain mismatch between calibration data and deployment context", 394 "supported": "strong" 395 }, 396 { 397 "claim": "LLM-as-judge defenses work effectively in WebArena (up to 100% TP, 0% FP) but fail in τ-bench due to lack of task context", 398 "evidence": "Table 4: GPT-5 mini achieves 100% TP / 0% FP on WebArena but 0% TP / 89% FP on τ-bench; GPT-5 nano achieves 87.5% TP but 59% FP on τ-bench", 399 "supported": "moderate" 400 }, 401 { 402 "claim": "Backdoored models outperform clean zero-shot baselines on benign tasks, creating a perverse incentive structure that masks compromise", 403 "evidence": "Across all tables, fine-tuned backdoored models consistently improve TSR over zero-shot baselines (e.g., τ-bench: 22.61% → 39-43%; WebArena: 0.6% → 14-16%), even at high poison rates", 404 "supported": "strong" 405 } 406 ], 407 "methodology_tags": [ 408 "benchmark-eval", 409 "case-study" 410 ], 411 "key_findings": "AI agent supply chains are highly vulnerable to trigger-based backdoor attacks across three distinct attack vectors (direct data poisoning, environmental poisoning, and backdoored base models), all achievable with minimal poisoning rates (2-5%). A critical and counterintuitive finding is that backdoored models simultaneously improve on benign task performance compared to clean baselines, making the compromise nearly undetectable through standard performance monitoring. All evaluated state-of-the-art defenses—two guardrail models and one weight-based detector—fail to reliably detect or prevent these attacks, with LLM-as-judge approaches only partially effective in one of two benchmark settings. Backdoors persist through extensive clean fine-tuning, including LoRA, posing a severe supply chain risk for organizations downloading and fine-tuning open-weight models.", 412 "red_flags": [ 413 { 414 "flag": "No limitations section", 415 "detail": "The paper has no dedicated limitations or threats-to-validity section; the absence is notable given the small test sets (as few as 25 tasks for TM3 airline evaluation) and only two benchmark environments tested." 416 }, 417 { 418 "flag": "No funding disclosure", 419 "detail": "No formal funding statement appears despite most authors being ServiceNow employees and ServiceNow being a direct commercial participant in the AI agent market the paper discusses." 420 }, 421 { 422 "flag": "Watch the Weights evaluated with domain mismatch", 423 "detail": "The Watch the Weights defense is calibrated on one domain (retail/NNetNav) and evaluated on another (airline/WebArena), which the authors acknowledge causes the 97-100% FPR—but they present this as a defense failure rather than an unfair evaluation condition." 424 }, 425 { 426 "flag": "Small TM3 evaluation set", 427 "detail": "TM3 (backdoored base model) is evaluated on only 25 airline test tasks in τ-bench across 3 trials, providing limited statistical power for claims about backdoor persistence." 428 }, 429 { 430 "flag": "No code or poisoned data released", 431 "detail": "The poisoned training datasets and attack pipeline code are not released, making independent verification of the attack effectiveness impossible without recreating the entire experimental setup." 432 }, 433 { 434 "flag": "Overbroad generalization", 435 "detail": "Conclusions are framed as applying to 'the agentic AI supply chain' broadly, but experiments cover only SFT-based fine-tuning pipelines on two benchmarks; RLHF-trained models, other agent architectures, and other task domains are not tested." 436 }, 437 { 438 "flag": "No contamination discussion", 439 "detail": "The paper does not address whether Qwen-2.5 or Llama-3.1 pre-training data included WebArena or τ-bench examples, which could confound baseline performance and the interpretation of fine-tuning gains." 440 } 441 ], 442 "cited_papers": [ 443 { 444 "title": "τ-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains", 445 "relevance": "Primary evaluation benchmark; the retail and airline environments are used across all three threat models" 446 }, 447 { 448 "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents", 449 "relevance": "Second primary evaluation benchmark; WebArena-Lite subset used for web agent backdoor experiments" 450 }, 451 { 452 "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training", 453 "relevance": "Foundational prior work on backdoor persistence in LLMs; TM3 directly tests whether their findings extend to agentic fine-tuning settings" 454 }, 455 { 456 "title": "DoomArena: A Framework for Testing AI Agents Against Evolving Security Threats", 457 "relevance": "Framework used for trigger injection in WebArena experiments; authored by some of the same team" 458 }, 459 { 460 "title": "NNetNav: Unsupervised Learning of Browser Agents Through Environment Interaction in the Wild", 461 "relevance": "Source of WebArena fine-tuning dataset (NNetNav-WA) and TM2 data collection methodology" 462 }, 463 { 464 "title": "LlamaFirewall: An Open Source Guardrail System for Building Secure AI Agents", 465 "relevance": "One of two guardrail defenses evaluated; found ineffective at both data screening and evaluation-time defense" 466 }, 467 { 468 "title": "Watch the Weights: Unsupervised Monitoring and Control of Fine-Tuned LLMs", 469 "relevance": "Weight-based defense evaluated for TM3; found to have near-100% false positive rate in realistic deployment scenarios" 470 }, 471 { 472 "title": "AgentPoison: Red-Teaming LLM Agents via Poisoning Memory or Knowledge Bases", 473 "relevance": "Related backdoor attack work against LLM agents; establishes prior art for the attack vector studied" 474 }, 475 { 476 "title": "EIA: Environmental Injection Attack on Generalist Web Agents for Privacy Leakage", 477 "relevance": "Prior work on environmental injection attacks; TM2 extends this concept to the training/data collection phase rather than deployment" 478 }, 479 { 480 "title": "Poisoning Web-Scale Training Datasets is Practical", 481 "relevance": "Establishes feasibility of data poisoning attacks at scale; motivates the threat model for TM1" 482 } 483 ], 484 "engagement_factors": { 485 "practical_relevance": { 486 "score": 3, 487 "justification": "Directly threatens any organization using open-weight models or third-party fine-tuning data for AI agents, with specific named enterprise platforms (Microsoft Copilot, ServiceNow, Salesforce Agentforce) called out." 488 }, 489 "surprise_contrarian": { 490 "score": 2, 491 "justification": "The finding that backdoored models outperform clean baselines on benign tasks—making detection harder the more capable the model—is counterintuitive and challenges the assumption that better-performing models are safer." 492 }, 493 "fear_safety": { 494 "score": 3, 495 "justification": "Explicitly demonstrates that AI agents can be covertly turned into data exfiltration tools, with concrete success rates and scenarios involving user confidential information leakage." 496 }, 497 "drama_conflict": { 498 "score": 2, 499 "justification": "Frames the threat against named commercial AI agent platforms and draws parallels to SolarWinds and xz-utils supply chain attacks, creating a compelling adversarial narrative." 500 }, 501 "demo_ability": { 502 "score": 1, 503 "justification": "Uses public benchmarks (τ-bench, WebArena) that practitioners could access, but no code is released and the full experimental pipeline would require significant reproduction effort." 504 }, 505 "brand_recognition": { 506 "score": 1, 507 "justification": "ServiceNow Research and Mila have moderate recognition in the ML community but are not top-tier lab brands (OpenAI, Google DeepMind, Anthropic) that drive outsized attention." 508 } 509 }, 510 "hn_data": { 511 "threads": [ 512 { 513 "hn_id": "33227427", 514 "title": "Neural Networks Are Decision Trees", 515 "points": 34, 516 "comments": 9, 517 "url": "https://news.ycombinator.com/item?id=33227427", 518 "created_at": "2022-10-16T21:43:27Z" 519 }, 520 { 521 "hn_id": "33232042", 522 "title": "Neural Networks Are Decision Trees", 523 "points": 4, 524 "comments": 2, 525 "url": "https://news.ycombinator.com/item?id=33232042", 526 "created_at": "2022-10-17T11:18:16Z" 527 }, 528 { 529 "hn_id": "33200244", 530 "title": "Neural Networks Are Decision Trees", 531 "points": 4, 532 "comments": 0, 533 "url": "https://news.ycombinator.com/item?id=33200244", 534 "created_at": "2022-10-14T06:28:28Z" 535 }, 536 { 537 "hn_id": "42911219", 538 "title": "High-resolution imaging of radio source associated with Dyson Sphere Candidate G", 539 "points": 3, 540 "comments": 0, 541 "url": "https://news.ycombinator.com/item?id=42911219", 542 "created_at": "2025-02-02T19:53:54Z" 543 }, 544 { 545 "hn_id": "33192776", 546 "title": "Neural Networks Are Decision Trees", 547 "points": 3, 548 "comments": 0, 549 "url": "https://news.ycombinator.com/item?id=33192776", 550 "created_at": "2022-10-13T15:57:00Z" 551 }, 552 { 553 "hn_id": "41741744", 554 "title": "Mitigating Memorization in Language Models", 555 "points": 2, 556 "comments": 0, 557 "url": "https://news.ycombinator.com/item?id=41741744", 558 "created_at": "2024-10-04T14:23:05Z" 559 }, 560 { 561 "hn_id": "33303465", 562 "title": "Grounded Language Model Reasoning Through Simulation", 563 "points": 2, 564 "comments": 0, 565 "url": "https://news.ycombinator.com/item?id=33303465", 566 "created_at": "2022-10-23T00:33:43Z" 567 }, 568 { 569 "hn_id": "45538593", 570 "title": "New paper: A single character can make or break your LLM evals", 571 "points": 1, 572 "comments": 1, 573 "url": "https://news.ycombinator.com/item?id=45538593", 574 "created_at": "2025-10-10T13:09:51Z" 575 }, 576 { 577 "hn_id": "46151267", 578 "title": "Generative Graph Vocabularies for Robust Graph Foundation Models Fine-Tuning", 579 "points": 1, 580 "comments": 0, 581 "url": "https://news.ycombinator.com/item?id=46151267", 582 "created_at": "2025-12-04T18:46:47Z" 583 }, 584 { 585 "hn_id": "42021531", 586 "title": "Understanding Warmup-Stable-Decay Learning Rates", 587 "points": 1, 588 "comments": 0, 589 "url": "https://news.ycombinator.com/item?id=42021531", 590 "created_at": "2024-11-01T21:02:29Z" 591 } 592 ], 593 "top_points": 34, 594 "total_points": 55, 595 "total_comments": 12 596 } 597 }