scan.json (23947B)
1 { 2 "paper": { 3 "title": "Detecting Silent Failures in Multi-Agentic AI Trajectories", 4 "authors": [ 5 "Divya Pathak", 6 "Harshit Kumar", 7 "Anuska Roy", 8 "Felix George", 9 "Mudit Verma", 10 "Pratibha Moogi" 11 ], 12 "year": 2025, 13 "venue": "arXiv preprint", 14 "arxiv_id": "2511.04032", 15 "doi": "10.48550/arXiv.2511.04032" 16 }, 17 "scan_version": 3, 18 "active_modules": [ 19 "experimental_rigor", 20 "data_leakage" 21 ], 22 "methodology_tags": [ 23 "benchmark-eval" 24 ], 25 "key_findings": "The paper introduces anomaly detection for multi-agentic AI system trajectories, curating two benchmark datasets (4,275 and 894 traces) from Stock Market and Research Writing assistant systems. XGBoost achieves up to 98% accuracy in supervised settings, while semi-supervised SVDD reaches 96%, suggesting labeled data may not be necessary. Error analysis reveals that subtle drift anomalies without cycles or errors remain the hardest failure type to detect.", 26 "checklist": { 27 "artifacts": { 28 "code_released": { 29 "applies": true, 30 "answer": false, 31 "justification": "Footnote 1 states 'The dataset and curation pipeline will be released after paper acceptance in accordance with organizational policies.' No code is currently available." 32 }, 33 "data_released": { 34 "applies": true, 35 "answer": false, 36 "justification": "Same footnote: datasets planned for release after acceptance, not currently available." 37 }, 38 "environment_specified": { 39 "applies": true, 40 "answer": false, 41 "justification": "No environment specifications, dependency lists, or hardware details are provided." 42 }, 43 "reproduction_instructions": { 44 "applies": true, 45 "answer": false, 46 "justification": "No reproduction instructions or scripts are provided." 
47 } 48 }, 49 "statistical_methodology": { 50 "confidence_intervals_or_error_bars": { 51 "applies": true, 52 "answer": false, 53 "justification": "Table 2 reports only point estimates for accuracy, F1, precision, and recall with no confidence intervals or error bars." 54 }, 55 "significance_tests": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper claims XGBoost outperforms other models based on comparing numbers in Table 2, with no statistical significance tests." 59 }, 60 "effect_sizes_reported": { 61 "applies": true, 62 "answer": true, 63 "justification": "Performance differences are reported with baseline context, e.g., 'Recall (97.03%) is slightly lower (-0.46%) than SVM (97.49%)' and similar comparisons throughout Section 3.3." 64 }, 65 "sample_size_justified": { 66 "applies": true, 67 "answer": false, 68 "justification": "No justification for dataset sizes (4,275 and 894 traces). The number of input prompts (525 and 112) is stated but not justified." 69 }, 70 "variance_reported": { 71 "applies": true, 72 "answer": false, 73 "justification": "No variance, standard deviation, or results across multiple runs are reported. All results appear to be single-run point estimates." 74 } 75 }, 76 "evaluation_design": { 77 "baselines_included": { 78 "applies": true, 79 "answer": true, 80 "justification": "Multiple baselines are compared: supervised (XGBoost, Random Forest, LR, SVM, Naive Bayes), semi-supervised (SVDD, Isolation Forest), and unsupervised (K-Means)." 81 }, 82 "baselines_contemporary": { 83 "applies": true, 84 "answer": true, 85 "justification": "The baselines are standard ML methods appropriate for the task. The paper notes that 'established techniques for AI Agents remain unexplored,' so there are no prior agent-specific baselines to compare against." 86 }, 87 "ablation_study": { 88 "applies": true, 89 "answer": false, 90 "justification": "No ablation study is performed on the feature set or pipeline components. 
The paper extracts 16 features but does not test subsets." 91 }, 92 "multiple_metrics": { 93 "applies": true, 94 "answer": true, 95 "justification": "Table 2 reports accuracy, macro-F1, precision, and recall for each model." 96 }, 97 "human_evaluation": { 98 "applies": true, 99 "answer": true, 100 "justification": "Two human annotators labeled the traces with inter-annotator agreement measured via Cohen's kappa (97.6% and 80.6%). This serves as human evaluation ground truth." 101 }, 102 "held_out_test_set": { 103 "applies": true, 104 "answer": true, 105 "justification": "Section 3.2: 'split into 70-15-15% for train-validation-test' with hyperparameters tuned on validation set." 106 }, 107 "per_category_breakdown": { 108 "applies": true, 109 "answer": true, 110 "justification": "Results are broken down by model type (supervised/semi-supervised/unsupervised) and across both datasets. SHAP feature importance analysis provides per-feature breakdowns." 111 }, 112 "failure_cases_discussed": { 113 "applies": true, 114 "answer": true, 115 "justification": "Section 3.3 provides detailed error analysis of false negatives, with three specific insights about why misclassifications occur (subtle drift resembling normal traces)." 116 }, 117 "negative_results_reported": { 118 "applies": true, 119 "answer": true, 120 "justification": "K-Means unsupervised performance is reported as moderate/weak, and the paper explicitly states 'there remains significant room for improvement in the unsupervised case.'" 121 } 122 }, 123 "claims_and_evidence": { 124 "abstract_claims_supported": { 125 "applies": true, 126 "answer": true, 127 "justification": "Abstract claims of 98% and 96% accuracy for XGBoost and SVDD are supported by Table 2 (Stock Market dataset: 98.03% and 96.47%)." 128 }, 129 "causal_claims_justified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper attributes detection performance to model capabilities without controlling for confounds. 
No causal claims are explicitly made, but the error analysis implies path-level features 'cause' better detection without rigorous causal design." 133 }, 134 "generalization_bounded": { 135 "applies": true, 136 "answer": false, 137 "justification": "The abstract frames this as a study of 'Multi-Agentic AI systems' generally, but results are from only two specific applications (Stock Market and Research Writing assistants). The paper does note the pipeline 'can be readily extended' but does not bound generalization claims." 138 }, 139 "alternative_explanations_discussed": { 140 "applies": true, 141 "answer": false, 142 "justification": "No discussion of alternative explanations for the results, such as whether the high accuracy is due to the anomalies being too easy (most involve cycles/errors with obvious feature signatures)." 143 }, 144 "proxy_outcome_distinction": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper equates detecting anomalous traces (the proxy) with detecting 'silent failures' (the claimed outcome) without discussing the gap. Many silent failures may not manifest as anomalous feature vectors, and not all anomalous traces represent meaningful failures." 148 } 149 }, 150 "setup_transparency": { 151 "model_versions_specified": { 152 "applies": true, 153 "answer": false, 154 "justification": "LLMs mentioned are 'gpt-4o, ibm-granite-3-1-8B, meta-llama-3-3-70B' — gpt-4o lacks a version/snapshot date. Granite and Llama include size but not exact version identifiers." 155 }, 156 "prompts_provided": { 157 "applies": true, 158 "answer": false, 159 "justification": "System prompts are categorized as 'poor', 'good', and 'strict' with brief descriptions, but the actual prompt text is not provided." 
160 }, 161 "hyperparameters_reported": { 162 "applies": true, 163 "answer": false, 164 "justification": "Section 3.2 states 'Hyperparameters were tuned via grid search on the validation set' but does not report the actual hyperparameter values used." 165 }, 166 "scaffolding_described": { 167 "applies": true, 168 "answer": true, 169 "justification": "The multi-agent systems are described: Stock Market has 3 agents and 9 tools with named tools; Research Writing has 9 agents and 6 tools with named roles. The agentic trace structure is diagrammed in Figure 1." 170 }, 171 "data_preprocessing_documented": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 2 describes the full pipeline: trace collection via OpenTelemetry, feature extraction (16 features in 5 categories), and labeling criteria with inter-annotator agreement." 175 } 176 }, 177 "limitations_and_scope": { 178 "limitations_section_present": { 179 "applies": true, 180 "answer": false, 181 "justification": "No dedicated limitations section. Section 4 (Conclusions and Future Plans) mentions areas for improvement but does not substantively discuss limitations." 182 }, 183 "threats_to_validity_specific": { 184 "applies": true, 185 "answer": false, 186 "justification": "No threats to validity are discussed. Future work mentions are about expanding methods, not about threats to the current study's validity." 187 }, 188 "scope_boundaries_stated": { 189 "applies": true, 190 "answer": false, 191 "justification": "No explicit scope boundaries are stated. The paper does not clarify what its results do NOT show or what settings are excluded." 192 } 193 }, 194 "data_integrity": { 195 "raw_data_available": { 196 "applies": true, 197 "answer": false, 198 "justification": "Raw data is not available. Planned for release after acceptance." 
199 }, 200 "data_collection_described": { 201 "applies": true, 202 "answer": true, 203 "justification": "Section 2.1 describes trace collection via OpenTelemetry instrumentation, with three factors varied (input query, LLM model, system prompt type). Specific numbers of prompts and models are provided." 204 }, 205 "recruitment_methods_described": { 206 "applies": false, 207 "answer": false, 208 "justification": "No human participants — data consists of synthetically generated agentic traces, not human-collected data. The annotators labeled traces but were not study participants." 209 }, 210 "data_pipeline_documented": { 211 "applies": true, 212 "answer": true, 213 "justification": "Figure 1 and Section 2 document the full pipeline: trace collection → feature extraction (16 features) → labeling. Numbers at each stage are provided (e.g., 525 prompts run across 3 prompt types and 3 LLMs, with the pipeline producing 4,275 datapoints)." 214 } 215 }, 216 "conflicts_of_interest": { 217 "funding_disclosed": { 218 "applies": true, 219 "answer": false, 220 "justification": "No funding information is disclosed. Five of six authors are from IBM Research." 221 }, 222 "affiliations_disclosed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Author affiliations are clearly stated: IBM Research (5 authors) and IIIT Bangalore (1 author)." 226 }, 227 "funder_independent_of_outcome": { 228 "applies": true, 229 "answer": false, 230 "justification": "No funding is disclosed. IBM Research authors evaluate IBM Granite models alongside others, creating a potential conflict that is not addressed." 231 }, 232 "financial_interests_declared": { 233 "applies": true, 234 "answer": false, 235 "justification": "No competing interests statement is present. IBM authors evaluate IBM Granite, which could represent a financial interest."
236 } 237 }, 238 "contamination": { 239 "training_cutoff_stated": { 240 "applies": false, 241 "answer": false, 242 "justification": "The paper does not evaluate pre-trained model capabilities on a benchmark. The LLMs are used to generate agent traces, not evaluated for their knowledge. The anomaly detection is performed by classical ML models on extracted features." 243 }, 244 "train_test_overlap_discussed": { 245 "applies": false, 246 "answer": false, 247 "justification": "Same as above — classical ML anomaly detection on extracted features, not evaluating pre-trained LLM capabilities." 248 }, 249 "benchmark_contamination_addressed": { 250 "applies": false, 251 "answer": false, 252 "justification": "Same as above — the benchmark is newly created trace data, and the evaluated models are classical ML, not pre-trained LLMs." 253 } 254 }, 255 "human_studies": { 256 "pre_registered": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants. Annotators labeled synthetic trace data." 260 }, 261 "irb_or_ethics_approval": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants." 265 }, 266 "demographics_reported": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants." 270 }, 271 "inclusion_exclusion_criteria": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants." 275 }, 276 "randomization_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants." 280 }, 281 "blinding_described": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants." 285 }, 286 "attrition_reported": { 287 "applies": false, 288 "answer": false, 289 "justification": "No human participants." 
290 } 291 }, 292 "cost_and_practicality": { 293 "inference_cost_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No inference costs, API costs, or token usage for the anomaly detection pipeline are reported." 297 }, 298 "compute_budget_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "No compute budget or hardware information is provided for either trace generation or model training." 302 } 303 }, 304 "experimental_rigor": { 305 "seed_sensitivity_reported": { 306 "applies": true, 307 "answer": false, 308 "justification": "No mention of random seeds or seed sensitivity. Results appear to be single-run." 309 }, 310 "number_of_runs_stated": { 311 "applies": true, 312 "answer": false, 313 "justification": "The number of experimental runs is not stated. Results are presented as single values." 314 }, 315 "hyperparameter_search_budget": { 316 "applies": true, 317 "answer": false, 318 "justification": "Grid search is mentioned but the search budget (number of configurations, ranges) is not reported." 319 }, 320 "best_config_selection_justified": { 321 "applies": true, 322 "answer": true, 323 "justification": "Section 3.2: 'Hyperparameters were tuned via grid search on the validation set' — selection on validation set, not test." 324 }, 325 "multiple_comparison_correction": { 326 "applies": true, 327 "answer": false, 328 "justification": "Multiple models are compared across two datasets with no statistical tests or multiple comparison corrections." 329 }, 330 "self_comparison_bias_addressed": { 331 "applies": true, 332 "answer": false, 333 "justification": "The authors created the datasets and evaluated all methods themselves. No acknowledgment of self-comparison bias." 334 }, 335 "compute_budget_vs_performance": { 336 "applies": true, 337 "answer": false, 338 "justification": "No discussion of compute costs for different methods. 
Some methods (XGBoost vs K-Means) have very different compute profiles but this is not addressed." 339 }, 340 "benchmark_construct_validity": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether the two synthetic agent systems and their artificially generated traces actually represent real-world multi-agent failures. Construct validity is a major concern for synthetic benchmarks." 344 }, 345 "scaffold_confound_addressed": { 346 "applies": false, 347 "answer": false, 348 "justification": "The anomaly detection models are classical ML, not scaffolded LLM systems. No scaffold confound applies." 349 } 350 }, 351 "data_leakage": { 352 "temporal_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of temporal ordering in the train/test split. Traces from the same input prompts with different LLMs/system prompt types could appear in both train and test." 356 }, 357 "feature_leakage_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of whether extracted features (e.g., error status in spans) trivially reveal the label, which would make detection artificially easy." 361 }, 362 "non_independence_addressed": { 363 "applies": true, 364 "answer": false, 365 "justification": "Traces generated from the same input prompts with different LLMs/prompt types are likely structurally similar. No discussion of whether train/test split respects prompt-level independence." 366 }, 367 "leakage_detection_method": { 368 "applies": true, 369 "answer": false, 370 "justification": "No leakage detection or prevention method is applied."
371 } 372 } 373 }, 374 "claims": [ 375 { 376 "claim": "XGBoost achieves accuracies up to 98% and 94% on the Stock Market and Research Writing datasets respectively.", 377 "evidence": "Table 2 shows XGBoost at 98.03% accuracy on Stock Market and 94.81% on Research Writing (Section 3.3).", 378 "supported": "strong" 379 }, 380 { 381 "claim": "Semi-supervised SVDD performs comparably to supervised methods, achieving 96% and 89% accuracy.", 382 "evidence": "Table 2 shows SVDD at 96.47% and 89.63% on the two datasets (Section 3.3).", 383 "supported": "strong" 384 }, 385 { 386 "claim": "This is the first systematic study of anomaly detection in Multi-Agentic AI systems.", 387 "evidence": "Section 1 states 'To the best of our knowledge no publicly available datasets currently exist' and 'established techniques for AI Agents remain unexplored.'", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "Path-level features are the most important for anomaly detection.", 392 "evidence": "SHAP analysis in Section 3.3 shows tool_count, total_steps, unique_steps, and agent_count are consistently top-ranked.", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "False negatives are caused by subtle drift anomalies that resemble normal traces in feature space.", 397 "evidence": "Error analysis in Section 3.3 with three insights showing FNs have feature values close to normal traces, supported by t-SNE visualization (Figure 2).", 398 "supported": "moderate" 399 } 400 ], 401 "red_flags": [ 402 { 403 "flag": "Potential feature leakage", 404 "detail": "Error status is extracted directly from trace spans as a feature. Since 'errors' is one of the anomaly types, including error-derived features could make detection trivially easy for a large portion of anomalies. The 98% accuracy may partly reflect this circularity." 405 }, 406 { 407 "flag": "Synthetic benchmark validity", 408 "detail": "Both datasets are generated from controlled systems with known configurations. 
Real-world multi-agent systems would have far more diverse and subtle failure modes. The high accuracies may not transfer." 409 }, 410 { 411 "flag": "Company evaluating own product", 412 "detail": "IBM Research authors include IBM Granite as one of three LLMs without disclosing potential conflicts of interest or funding." 413 }, 414 { 415 "flag": "No variance or statistical tests", 416 "detail": "All results are single-run point estimates with no error bars, confidence intervals, or significance tests despite comparing 8 models across 2 datasets." 417 }, 418 { 419 "flag": "Train/test independence concern", 420 "detail": "With 525 prompts generating 4,275 traces (~8 traces per prompt), traces from the same prompt likely share structural features. Random 70-15-15 split may place correlated traces in both train and test, inflating accuracy." 421 }, 422 { 423 "flag": "Low inter-annotator agreement on one dataset", 424 "detail": "Cohen's kappa of 80.6% on the Research Writing dataset suggests substantial ambiguity in what constitutes an anomaly, yet the paper treats labels as ground truth." 425 } 426 ], 427 "cited_papers": [ 428 { 429 "title": "Why do multi-agent llm systems fail?", 430 "authors": [ 431 "Mert Cemri", 432 "Melissa Z Pan", 433 "Shuyi Yang" 434 ], 435 "year": 2025, 436 "arxiv_id": "2503.13657", 437 "relevance": "Directly studies failure modes in multi-agent LLM systems." 438 }, 439 { 440 "title": "Multi-agent risks from advanced ai", 441 "authors": [ 442 "Lewis Hammond", 443 "Alan Chan", 444 "Jesse Clifton" 445 ], 446 "year": 2025, 447 "arxiv_id": "2502.14143", 448 "relevance": "Comprehensive analysis of risks in multi-agent AI systems relevant to safety research." 
449 }, 450 { 451 "title": "SentinelAgent: Graph-based anomaly detection in multi-agent systems", 452 "authors": [ 453 "Xu He", 454 "Di Wu", 455 "Yan Zhai", 456 "Kun Sun" 457 ], 458 "year": 2025, 459 "arxiv_id": "2505.24201", 460 "relevance": "Graph-based anomaly detection approach for multi-agent systems, closely related work." 461 }, 462 { 463 "title": "ReAct: Synergizing reasoning and acting in language models", 464 "authors": [ 465 "Shunyu Yao", 466 "Jeffrey Zhao", 467 "Dian Yu" 468 ], 469 "year": 2022, 470 "arxiv_id": "2210.03629", 471 "relevance": "Foundational prompting pattern used in the agent system designs evaluated in this paper." 472 }, 473 { 474 "title": "XGBoost: A scalable tree boosting system", 475 "authors": [ 476 "Tianqi Chen", 477 "Carlos Guestrin" 478 ], 479 "year": 2016, 480 "doi": "10.1145/2939672.2939785", 481 "relevance": "Core ML method used as top-performing supervised anomaly detector in the study." 482 }, 483 { 484 "title": "A unified approach to interpreting model predictions", 485 "authors": [ 486 "Scott M Lundberg", 487 "Su-In Lee" 488 ], 489 "year": 2017, 490 "relevance": "SHAP framework used for feature importance analysis of anomaly detection models." 491 } 492 ], 493 "engagement_factors": { 494 "practical_relevance": { 495 "score": 1, 496 "justification": "Addresses a real problem (silent agent failures) but datasets aren't released yet and the techniques are standard ML classifiers, not a usable tool." 497 }, 498 "surprise_contrarian": { 499 "score": 1, 500 "justification": "The finding that semi-supervised methods nearly match supervised ones is mildly interesting but not shocking; otherwise results confirm expected ML baselines." 501 }, 502 "fear_safety": { 503 "score": 1, 504 "justification": "Silent failures in agentic systems touch on reliability concerns but the paper frames it as an engineering/monitoring problem, not a safety risk." 
505 }, 506 "drama_conflict": { 507 "score": 0, 508 "justification": "No controversy, no challenge to specific claims or companies; straightforward benchmarking paper." 509 }, 510 "demo_ability": { 511 "score": 0, 512 "justification": "Datasets and code are not yet released ('will be released after paper acceptance'), so nothing to try." 513 }, 514 "brand_recognition": { 515 "score": 1, 516 "justification": "IBM Research is a recognized institution but not a top-tier ML hype brand; no famous product involved." 517 } 518 } 519 }