scan.json (27353B)
1 { 2 "paper": { 3 "title": "Stop Wasting Your Tokens: Towards Efficient Runtime Multi-Agent Systems", 4 "authors": ["Fulin Lin", "Shaowen Chen", "Ruishan Fang", "Hongwei Wang", "Tao Lin"], 5 "year": 2025, 6 "venue": "arXiv (under review)", 7 "arxiv_id": "2510.26585", 8 "doi": "10.48550/arXiv.2510.26585" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "SupervisorAgent, a lightweight meta-agent framework for runtime MAS supervision, reduces token consumption by ~29.7% on GAIA without compromising success rates. The framework's three strategies (error correction, inefficiency guidance, observation purification) are validated across 6 benchmarks and 3 foundation models. Ablation shows purification drives token savings while correction/guidance maintain accuracy. The approach generalizes across MAS frameworks (Smolagent, AWorld, OAgents).", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "GitHub repository URL provided in abstract: https://github.com/LINs-lab/SupervisorAgent." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "All evaluation benchmarks are publicly available standard datasets (GAIA, GSM8k-Hard, AIME 2024, HumanEval, MBPP, DROP). No proprietary data was collected." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided in the paper. Only model names are mentioned." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "Implementation details are described in Appendix A.3 and prompts in A.5, but no step-by-step reproduction instructions or runnable scripts are provided in the paper itself. The GitHub repo may contain these but the paper does not." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "No confidence intervals or error bars are reported in any results table. Tables 1-4 report point estimates only. Figure 3 shows violin plots of distributions but no formal CIs." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "No statistical significance tests are performed. Claims like '29.68% token reduction' and accuracy comparisons are based solely on comparing point estimates without any statistical test." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Percentage improvements are consistently reported with baseline context, e.g., '29.68% token reduction' (Table 1), '23.74% token reduction on HumanEval' (Table 2), with absolute before/after numbers provided." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for sample sizes. GSM8k-Hard uses 600 samples and DROP uses 800 samples, but no rationale for these specific numbers beyond 'following the sampling strategy of prior work' for DROP." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Figure 3 shows violin plots of token cost distributions and variance reduction, but no standard deviations, IQR, or spread measures appear in any results table. Main results (Tables 1-4) are single-point estimates." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Multiple baselines included: Vanilla LLM, CoT-SC, CodeAgent, Smolagent, OWL, OAgents, MetaAgent, AWorld (Tables 1 and 2)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Baselines are very recent (2025): AWorld, OAgents, OWL, MetaAgent, Smolagent. All are current state-of-the-art MAS frameworks." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Table 3 presents ablation removing each of the three core strategies (w/o Correction, w/o Guidance, w/o Purification), isolating their individual contributions." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Two primary metrics: accuracy/F1 and token consumption. Also reports pass@k for k=1,2,3 on GAIA, and variance reduction (Figure 3)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation of the system's outputs. All evaluation is automated (pass/fail on benchmarks, token counts). Given claims about 'robustness' and 'guidance quality', human evaluation of intervention quality would be relevant." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "GAIA uses the test set with 164 tasks (Appendix A.2.1 states 'we utilize the test set'). HumanEval and MBPP are standard test sets." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results broken down by GAIA difficulty level (L1, L2, L3) in Tables 1, 3, 4. Table 2 breaks down across different benchmark domains." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 6 discusses overcompression causing 'minor accuracy or F1 drops' and the 'noise-as-signal' problem where purification can paradoxically harm performance. Case study in Appendix A.4 shows a baseline failure mode." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Several negative findings reported: overcompression can harm performance (Section 6), MBPP accuracy drops slightly with SMAS (84.43 vs 85.68 in Table 2), DROP F1 drops (79.80 vs 81.08), and the noise-as-signal discovery." 
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims '29.45% token reduction' — Table 1 shows 29.68% at pass@1 (minor discrepancy between abstract and table). 'Without compromising success rate' — Table 1 pass@1 shows same 50.91% accuracy. Claims about 5 additional benchmarks supported by Table 2." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims ('SupervisorAgent reduces token consumption'). The ablation study (Table 3) provides controlled single-variable manipulation by removing each component individually, which is adequate for these claims." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "Claims are reasonably bounded to tested settings. The paper tests across 6 benchmarks, 3 foundation models, and 3 MAS frameworks, and states specific results for each configuration rather than making unbounded generalizations." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No substantive discussion of alternative explanations for observed improvements. The paper does not consider whether gains come from, e.g., prompt engineering effects, specific model sensitivities, or benchmark-specific characteristics rather than the supervision framework itself." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper measures token consumption and task accuracy directly, which are the actual metrics claimed. No proxy gap exists — they don't frame token counts as 'efficiency' in a broader sense beyond what they measure." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model versions given: 'GPT-4.1 (OpenAI, 2025)', 'Gemini-2.5-pro-0605', 'Qwen3-235B-2507', 'Qwen3-32B'. Gemini and Qwen include date/version suffixes. GPT-4.1 is a specific API model." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt text provided in Appendix A.5 for all supervision types: base prompt, error_occurrence, inefficiency_behavior, excessive observation length, and result_synthesis." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "The observation length threshold (3,000 characters) is stated in Appendix A.3.2, but no LLM inference hyperparameters (temperature, top-p, max tokens) are reported for any model." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "Extensive scaffolding description in Section 4 (methodology) and Appendix A.3 (implementation details), including the adaptive filter, context window construction, intervention pipeline, and action execution." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Dataset sampling documented: 600 random samples from GSM8k-Hard, 800 from DROP 'following the sampling strategy of prior work', full HumanEval/MBPP, GAIA test set (164 tasks). Ablation subset defined as top-10 most token-intensive tasks per level." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 6 (Discussion) includes substantive discussion of limitations: overcompression risks, noise-as-signal problem, need for better purification techniques, and the limitation of token cost as sole efficiency metric." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 6 identifies specific threats: 'seemingly noisy information, such as HTML structure and truncation cues, serves as a vital signal for ReAct-style agents' and 'overly aggressive purification can paradoxically harm performance.' These are specific to this system." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit statement of what the results do NOT show. The paper does not specify which MAS types, task domains, or model families are excluded from their claims. Future directions mention 'self-evolving version' but don't bound current claims." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental data (per-task results, per-run outputs) is made available. Only aggregated results in tables and figures." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Data collection is straightforward as all datasets are standard public benchmarks. Appendix A.2.1 describes each dataset's characteristics and the specific splits/subsets used." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. All evaluation uses standard public benchmarks." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The pipeline from benchmark input to final reported numbers is not fully documented. No description of how outputs were evaluated (automated scripts? manual checking?), how pass@k was computed from runs, or how token counts were aggregated." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding or acknowledgments section is present in the paper." 
209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations clearly listed: Zhejiang University and Westlake University. The work was done during Fulin's visit to Westlake." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding information is disclosed, so independence cannot be assessed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial disclosure statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates stated for any model used (GPT-4.1, Gemini-2.5-pro, Qwen3). This is relevant since HumanEval (2021), MBPP (2021), GSM8k, and DROP are all public benchmarks likely in training data." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of potential train/test overlap despite using well-known benchmarks (HumanEval, MBPP, GSM8k) that have been public for years before the models' training cutoffs." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "HumanEval (2021) and MBPP (2021) have been publicly available for years and are almost certainly in the training data of GPT-4.1 and Qwen3. No contamination analysis is discussed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 
258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Token consumption is a primary metric reported for all experiments. Tables 1-4 report average tokens consumed per task for every method and configuration." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total compute budget stated — no GPU hours, total API spend, or wall-clock time for running experiments. Only per-task average token counts are reported." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No seed sensitivity analysis. pass@k results imply multiple runs but no reporting of variance across random seeds." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The paper reports pass@1, pass@2, pass@3 on GAIA implying multiple runs, but does not explicitly state the total number of runs per task. For other benchmarks (Table 2), single results are reported with no run count." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget reported. The observation length threshold (3000 chars) and other design choices appear fixed without justifying how they were selected." 
307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The 3000-character threshold and other filter heuristics are presented without justification for how these values were selected or whether alternatives were explored." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "Many comparisons made across 6 benchmarks, multiple models, and ablation variants. No correction for multiple comparisons applied, and no significance tests are used at all." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "Authors implement their own system and compare against existing frameworks without acknowledging potential bias from their implementation choices or familiarity with their method." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Token consumption (a compute proxy) is explicitly reported alongside accuracy for every experiment, enabling direct efficiency-performance comparison. Tables consistently pair accuracy with token counts." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether the chosen benchmarks actually measure the claimed capabilities. The paper uses GAIA for 'general problem-solving' and HumanEval for 'code generation' without questioning construct validity." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": true, 336 "justification": "The paper explicitly addresses this by testing SupervisorAgent across three different MAS frameworks (Smolagent, AWorld, OAgents) in Table 4, and discusses the choice of Smolagent as primary testbed due to its reliance on internal reasoning rather than external tools (Section 6)." 
337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of temporal leakage. HumanEval (2021) and MBPP (2021) solutions have been publicly available for years and are likely in the training data of 2025 models." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. The SupervisorAgent itself has access to the global task description, which could provide hints not available in unmonitored settings." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether benchmark problems share structural similarities or whether training data contains related problems." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is applied despite using multiple well-known benchmarks with models likely trained on their solutions." 
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "SupervisorAgent reduces token consumption of Smolagent by 29.68% on GAIA at pass@1 while maintaining equivalent success rates.", 365 "evidence": "Table 1 shows 50.91% accuracy for both Smolagent and SMAS at pass@1, with average tokens dropping from 527.76K to 371.12K (29.68% reduction).", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Efficiency gains are more pronounced on harder tasks, with 32.39% savings on Level 2 and 30.10% on Level 3 at pass@3.", 370 "evidence": "Table 1 pass@3 row shows L2 tokens: 605.05K→409.05K (32.39%) and L3: 611.87K→427.72K (30.10%).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "SupervisorAgent generalizes across diverse benchmarks, achieving 23.74% token reduction on HumanEval alongside an accuracy improvement.", 375 "evidence": "Table 2 shows HumanEval tokens drop from 40.91K to 31.19K (23.74%) and accuracy rises from 92.07% to 92.68%.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "The framework is model-agnostic, consistently delivering token savings across GPT-4.1, Gemini-2.5-pro, and Qwen3-235B.", 380 "evidence": "Figure 4b shows consistent token reduction across all three models on GAIA, though exact numbers are only in the figure.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "SupervisorAgent significantly reduces variance in token consumption, improving performance consistency.", 385 "evidence": "Figure 3 shows violin plots with visibly tighter distributions for SMAS and bar charts showing reduced variance, especially for L2 (63% reduction mentioned in Figure 1 caption).", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Purification is the primary driver of token reduction, while Correction and Guidance are critical for maintaining accuracy.", 390 "evidence": "Table 3 ablation: w/o Purification shows highest remaining token cost (851K vs 721K full). 
Removing Correction or Guidance maintains token savings but accuracy drops from 46.67% to 40.00%.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "SupervisorAgent generalizes across MAS frameworks, reducing tokens by 36.54% on AWorld and 39.36% on OAgents.", 395 "evidence": "Table 4 shows cross-framework results on GAIA subset with consistent token reductions.", 396 "supported": "moderate" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "No statistical significance testing", 402 "detail": "All claims of improvement are based on comparing point estimates without any significance tests. Given the stochastic nature of LLM outputs, observed differences could be due to sampling variance." 403 }, 404 { 405 "flag": "Benchmark contamination unaddressed", 406 "detail": "HumanEval and MBPP were published in 2021 and are almost certainly in the training data of 2025 models. This doesn't invalidate the relative comparison (both baseline and SMAS use the same models) but is a methodological gap." 407 }, 408 { 409 "flag": "Ablation on non-representative subset", 410 "detail": "The ablation study (Table 3) and cross-framework evaluation (Table 4) are conducted on only the top-10 most token-intensive tasks per GAIA level, not the full benchmark. This biases results toward cases where supervision is most impactful." 411 }, 412 { 413 "flag": "Minor abstract-table discrepancy", 414 "detail": "Abstract claims '29.45%' token reduction while Table 1 shows 29.68%. Small but suggests the abstract may have been written from a different run." 415 }, 416 { 417 "flag": "No LLM hyperparameters reported", 418 "detail": "Temperature, top-p, and other sampling parameters are not stated for any model, yet these significantly affect output quality and token usage." 419 } 420 ], 421 "cited_papers": [ 422 { 423 "title": "Why do multi-agent LLM systems fail?", 424 "authors": ["Mert Cemri", "Melissa Z. 
Pan", "Shuyi Yang"], 425 "year": 2025, 426 "arxiv_id": "2503.13657", 427 "relevance": "Analyzes failure modes in multi-agent LLM systems, directly motivating this work's supervision approach." 428 }, 429 { 430 "title": "Which agent causes task failures and when? On automated failure attribution of LLM multi-agent systems", 431 "authors": ["Shaokun Zhang", "Ming Yin"], 432 "year": 2025, 433 "arxiv_id": "2505.00212", 434 "relevance": "Addresses failure attribution in MAS, the reactive approach that SupervisorAgent aims to surpass with proactive intervention." 435 }, 436 { 437 "title": "AgentDropout: Dynamic agent elimination for token-efficient and high-performance LLM-based multi-agent collaboration", 438 "authors": ["Zhexuan Wang", "Yutong Wang"], 439 "year": 2025, 440 "arxiv_id": "2503.18891", 441 "relevance": "Design-time MAS efficiency optimization through agent pruning, complementary to SupervisorAgent's runtime approach." 442 }, 443 { 444 "title": "Aegis: Taxonomy and optimizations for overcoming agent-environment failures in LLM agents", 445 "authors": ["Kevin Song", "Anand Jayarajan"], 446 "year": 2025, 447 "arxiv_id": "2508.19504", 448 "relevance": "Proposes failure taxonomies for LLM agents, relevant to understanding and categorizing agent failure modes." 449 }, 450 { 451 "title": "GAIA: a benchmark for general AI assistants", 452 "authors": ["Grégoire Mialon", "Clémentine Fourrier"], 453 "year": 2023, 454 "arxiv_id": "2311.12983", 455 "relevance": "Primary evaluation benchmark for multi-agent general problem-solving capabilities." 456 }, 457 { 458 "title": "Evaluating large language models trained on code", 459 "authors": ["Mark Chen", "Jerry Tworek"], 460 "year": 2021, 461 "arxiv_id": "2107.03374", 462 "relevance": "HumanEval benchmark for code generation, used as one of the evaluation benchmarks." 
463 }, 464 { 465 "title": "ReAct: Synergizing reasoning and acting in language models", 466 "authors": ["Shunyu Yao", "Jeffrey Zhao"], 467 "year": 2023, 468 "arxiv_id": "2210.03629", 469 "relevance": "Foundational agent architecture pattern (ReAct) that SupervisorAgent is designed to supervise." 470 }, 471 { 472 "title": "OWL: Optimized Workforce Learning for general multi-agent assistance in real-world task automation", 473 "authors": ["Mengkang Hu", "Yuhang Zhou"], 474 "year": 2025, 475 "arxiv_id": "2505.23885", 476 "relevance": "Multi-agent framework used as a baseline for comparison." 477 }, 478 { 479 "title": "Efficient agents: Building effective agents while reducing cost", 480 "authors": ["Ningning Wang", "Xavier Hu"], 481 "year": 2025, 482 "arxiv_id": "2508.02694", 483 "relevance": "Directly addresses agent efficiency and cost reduction, a key concern shared with this paper." 484 }, 485 { 486 "title": "MetaAgent: Automatically constructing multi-agent systems based on finite state machines", 487 "authors": ["Yaolun Zhang", "Xiaogeng Liu"], 488 "year": 2025, 489 "arxiv_id": "2507.22606", 490 "relevance": "Automatic MAS construction approach used as a baseline, representing design-time optimization." 491 }, 492 { 493 "title": "SMART: Self-aware agent for tool overuse mitigation", 494 "authors": ["Cheng Qian", "Emre Can Acikgoz"], 495 "year": 2025, 496 "arxiv_id": "2502.11435", 497 "relevance": "Addresses tool overuse in agents, related to the efficiency and intervention themes of SupervisorAgent." 498 }, 499 { 500 "title": "AgentPoison: Red-teaming LLM agents via poisoning memory or knowledge bases", 501 "authors": ["Zhaorun Chen", "Zhen Xiang"], 502 "year": 2024, 503 "arxiv_id": "2407.12784", 504 "relevance": "Demonstrates memory poisoning vulnerabilities in LLM agents, motivating SupervisorAgent's memory interaction monitoring." 505 } 506 ] 507 }