scan.json (27213B)
1 { 2 "paper": { 3 "title": "Beyond Task Completion: An Assessment Framework for Evaluating Agentic AI Systems", 4 "authors": [ 5 "Sreemaee Akshathala", 6 "Bassam Adnan", 7 "Mahisha Ramesh", 8 "Karthik Vaidhyanathan", 9 "Basil Muhammed", 10 "Kannan Parthasarathy" 11 ], 12 "year": 2025, 13 "venue": "arXiv preprint", 14 "arxiv_id": "2512.12791" 15 }, 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "The paper provides a GitHub URL (https://github.com/sa4s-serc/asf) in the introduction and again in Section 4.2 ('Our experiments and results are available on GitHub'), indicating code and experiment trajectories have been released." 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": false, 26 "justification": "The synthetic CloudOps data (instances, logs, policies) was generated using Sonnet 4.5 but no dataset download link or release is mentioned. The GitHub repository contains experiment trajectories but the underlying synthetic dataset is not described as separately released." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": false, 31 "justification": "The paper mentions GPT-4o with temperature 0.7, ChromaDB for embeddings, and text-embedding-3-small, and the Mem0 library with 'default settings', but no requirements.txt, Dockerfile, or conda environment file with library versions is provided in the paper. The setup description is partial." 32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": false, 36 "justification": "While a GitHub link is provided, the paper itself does not include step-by-step reproduction instructions. The GitHub repository may contain them but the paper does not describe the commands or procedure needed to reproduce the results." 37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": false, 43 "justification": "Results in Tables 3-7 are reported as point estimates only (e.g., '33.7%' precision, '37.9%' recall). While Figure 3 shows distribution plots across three runs (which implies spread), no confidence intervals or explicit error bars are reported in the tables." 44 }, 45 "significance_tests": { 46 "applies": true, 47 "answer": false, 48 "justification": "RQ1 frames a comparison between baseline and framework evaluation approaches, but no statistical significance tests are used to support the claim that the framework is more effective. The comparison relies on showing the framework reveals additional metrics, without formal hypothesis testing." 49 }, 50 "effect_sizes_reported": { 51 "applies": false, 52 "answer": false, 53 "justification": "No comparative effect sizes are computed. The paper reports descriptive metrics (completion rates, precision/recall) but does not compare two competing methods with effect size statistics." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "Only three scenarios (S1, S2, S3), each run three times, are used to validate the framework. No justification for why three scenarios or three runs are sufficient is provided; no power analysis is discussed." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": false, 63 "justification": "Each scenario was executed three times and results were 'averaged to ensure reliability', but Tables 3-6 report only the averages. Table 7 reports 'average failures per pillar' but no standard deviation or range is given. Figure 3 shows token/cost distributions visually but without numerical variance statistics." 64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "Table 3 explicitly compares a 'Baseline (B)' evaluation (capturing only task completion and tool usage ratio) against the proposed 'Framework (F)' metrics across all three scenarios, directly addressing RQ1." 71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": true, 75 "justification": "The baseline is a standard task-completion + tool-usage-ratio evaluation, which the paper correctly identifies as the current dominant practice. The comparison is fair given the paper's goal of showing what standard metrics miss." 76 }, 77 "ablation_study": { 78 "applies": true, 79 "answer": false, 80 "justification": "Table 7 is labeled 'Pillar ablation' but shows a per-category breakdown of failure counts across pillars, not a proper ablation study. No pillar is removed from the framework to measure its contribution — the table simply counts how many failures each category detects. A true ablation would remove a pillar and show the framework's effectiveness degrades." 81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": true, 85 "justification": "The framework uses many metrics including task completion, tool usage ratio, tool sequence correctness, expected tool calls, policy adherence, dependency inquiry, memory P/R/F1, and judge scores (completion, safety, memory, reasoning, overall)." 86 }, 87 "human_evaluation": { 88 "applies": true, 89 "answer": false, 90 "justification": "All evaluation is automated (LLM-as-Judge and Agent-as-Judge). No human experts validate whether the framework's detected failures are genuine. For a framework claiming to identify behavioral failures, human validation of those findings would be relevant." 91 }, 92 "held_out_test_set": { 93 "applies": true, 94 "answer": false, 95 "justification": "The three CloudOps scenarios were designed based on production experiences at MontyCloud and used directly for evaluation. There is no separate held-out test set; the same scenarios are used for framework validation without a train/test split." 96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": true, 100 "justification": "Results are broken down per pillar (LLM, Memory, Tools, Environment) in Table 7, and per scenario (S1, S2, S3) throughout. Table 4 further decomposes memory retrieval into Single-hop, Multi-hop, and Temporal types." 101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": true, 105 "justification": "Section 4.2 and Table 7 describe concrete failure examples (e.g., 'Skipped policy validation before instance termination', 'Did not recall previous role mappings or configuration changes'). Section 5 also discusses specific failure patterns observed." 106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The paper reports substantial failures across all pillars: S1 had only 33% policy adherence, S2 had 13.1% memory recall, S3 had 7.67 average tool failures. The paper explicitly presents these negative results to demonstrate gaps in baseline evaluation." 111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The abstract claims the framework 'reveal[s] behavioral deviations overlooked by conventional metrics', which is supported by Tables 3-7 showing framework metrics surface failures (e.g., 33% policy adherence) hidden by 100% task completion in S2." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": false, 122 "justification": "The paper makes implicit causal claims (e.g., 'tool sequencing and memory management need particular attention in multi-agent scenarios') derived from observational differences across three scenarios with different complexity. No controlled single-variable manipulation isolates the cause of observed failures." 123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": false, 127 "justification": "The paper frames the framework as general ('an end-to-end Agent Assessment Framework') but validates it only on CloudOps scenarios with a single LLM (GPT-4o). Section 5.1 acknowledges external validity threats but the title and framing imply broad applicability beyond what was tested." 128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": false, 132 "justification": "Section 5.1 Threats to Validity notes that 'Ground truth contracts defined by us may not reflect all valid solution paths' and 'Judge-based protocols introduce potential LLM evaluation biases', but these are mostly generic limitations. No alternative explanations for WHY specific failures occurred are systematically considered." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper uses 'GPT-4o' without specifying a version or snapshot date. It also mentions 'Sonnet 4.5' for synthetic data generation and 'text-embedding-3-small' for embeddings. 'GPT-4o' without a version or API snapshot date does not count as a specified version per schema criteria." 140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper mentions 'structured prompts' for LLM-as-Judge (Table 5) and describes the evaluation protocol, but no actual prompt text is provided in the paper or appendix. Section 3.3 describes that the LLM-as-Judge 'receives execution logs and produces scored assessments' but the actual prompt is not shown." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": false, 149 "justification": "Temperature (0.7) is reported for GPT-4o, and 'default settings' of Mem0 are mentioned. However, top-p, max tokens, and other LLM API settings are not specified. Mem0 default settings are not enumerated." 150 }, 151 "scaffolding_described": { 152 "applies": true, 153 "answer": true, 154 "justification": "The paper describes the agentic scaffolding in detail: MOYA framework, memory using Mem0 with ChromaDB, tool invocation protocols, test case generation, static/dynamic/judge evaluation layers, Agent Card mechanism, and Agent-as-Judge auditor protocols. Figures 1 and 2 provide architectural diagrams." 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": false, 159 "justification": "Synthetic data was 'generated using Sonnet 4.5' based on production issues encountered at MontyCloud, but the actual generation process, prompts used, volume of data, filtering criteria, and preprocessing steps are not documented." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 5.1 is titled 'Threats to Validity' and addresses external, internal, and construct validity with substantive discussion beyond a single sentence." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 5.1 identifies specific threats: evaluation limited to CloudOps domain (external), uniform temperature 0.7 and single model selection (internal), metrics may not capture all reliability aspects and memory assessment excludes storage efficiency (construct). These are study-specific, not boilerplate." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": false, 176 "justification": "While Section 5.1 notes the framework is evaluated only on CloudOps, it does not explicitly state what the results do NOT show. Future work section gestures at extending beyond CloudOps but does not enumerate specific claims that the paper is NOT making." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": false, 183 "justification": "GitHub link is provided for 'experiments and results' (trajectories) but the underlying synthetic CloudOps data (instances, logs, policies) is not described as publicly available for independent verification." 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": false, 188 "justification": "Synthetic data was 'generated using Sonnet 4.5 due to its ranking on LMArena' based on 'production issues we encountered', but the prompts used, generation criteria, volume, and verification procedures for the synthetic data are not described." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "There are no human participants in this study. The data is synthetic CloudOps scenarios, not collected from human subjects." 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": false, 198 "justification": "The pipeline from synthetic data generation through to evaluation metrics is not fully documented. The paper describes the evaluation framework components but does not trace the full data pipeline with intermediate steps and filtering." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": false, 205 "justification": "No acknowledgments section or funding disclosure is present in the paper. The industry collaboration with MontyCloud is described in the body, but no explicit funding statement is provided." 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Author affiliations are clearly listed on the title page: Sreemaee Akshathala, Bassam Adnan, Mahisha Ramesh, Karthik Vaidhyanathan at IIIT-Hyderabad SERC; Basil Muhammed and Kannan Parthasarathy at MontyCloud Inc. The MontyCloud affiliation is prominently disclosed." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": false, 215 "justification": "MontyCloud Inc. employees are co-authors of the paper, and the framework is validated on MontyCloud's production system (MOYA). MontyCloud has a direct financial interest in the framework's apparent effectiveness, making the funder non-independent of the outcome." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests statement or declaration of financial interests is present in the paper. The MontyCloud affiliation creates an undisclosed potential conflict: two authors are MontyCloud employees evaluating their own system." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": true, 226 "answer": false, 227 "justification": "GPT-4o is used as the agent backbone but no training data cutoff date is stated for GPT-4o. The paper does not address whether the model's training data could include the evaluation scenarios or benchmark components." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": true, 231 "answer": false, 232 "justification": "The paper does not discuss whether GPT-4o's training data overlaps with the CloudOps scenarios, policies, or tool descriptions used in evaluation. Since scenarios are based on real production patterns, some training data overlap is plausible." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": false, 236 "answer": false, 237 "justification": "The evaluation scenarios are custom-designed CloudOps workflows (not public benchmarks), so benchmark contamination in the traditional sense (e.g., HumanEval appearing in training data) does not apply here." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "There are no human participants in this study. The evaluation is entirely automated using LLM agents and synthetic CloudOps scenarios." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants; IRB or ethics approval is not applicable." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants; demographics are not applicable." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants; inclusion/exclusion criteria are not applicable." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants or controlled experiment with human assignment; not applicable." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants or human evaluators; blinding is not applicable." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants; attrition is not applicable." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": true, 281 "justification": "Section 4.2.3 (RQ3) explicitly reports inference costs: average $0.0621 per scenario execution run, LLM-as-Judge adds $0.0593 total, Agent-as-Judge $0.9572 total. Figure 4 shows cost breakdowns per evaluation protocol." 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": true, 286 "justification": "Section 4.2.3 reports average execution time (183.5s per run), token consumption (19,644 input tokens, 1,301 output tokens per average run), and scenario-level cost ($0.0405–$0.0818). The compute budget is quantified in practical terms." 287 } 288 } 289 }, 290 "claims": [ 291 { 292 "claim": "Conventional task-completion and tool-usage-ratio metrics miss critical behavioral failures in agentic systems.", 293 "evidence": "Table 3 shows S2 achieves 100% task completion and 0.97 tool usage ratio under baseline, but framework metrics reveal only 33% tool sequence correctness, 0% dependency inquiry, and 13.1% memory recall. Section 5 elaborates these failures.", 294 "supported": "strong" 295 }, 296 { 297 "claim": "The proposed four-pillar framework (LLM, Memory, Tools, Environment) effectively surfaces behavioral failures across all pillars.", 298 "evidence": "Table 7 shows per-pillar failure counts across all scenarios; tool failures peak at 7.67 in S3, memory failures at 3.67, LLM at 1.67. These failures are invisible in baseline evaluation.", 299 "supported": "moderate" 300 }, 301 { 302 "claim": "LLM-as-Judge is cost-efficient for continuous monitoring ($0.0593 total overhead, 14.7s) while Agent-as-Judge provides deeper assessment at higher cost ($0.9572, 913.4s).", 303 "evidence": "Section 4.2.3 and Figure 4 report these exact cost and time figures across three scenarios.", 304 "supported": "strong" 305 }, 306 { 307 "claim": "Tool orchestration failures are the dominant failure mode in complex multi-agent scenarios.", 308 "evidence": "Table 7 shows Tools pillar has highest failure count in S3 (7.67 average), attributed to 'missed diagnostic or verification steps before applying remediation.'", 309 "supported": "moderate" 310 }, 311 { 312 "claim": "Memory retrieval failures scale with scenario complexity, with multi-hop and temporal retrieval showing high precision but low recall.", 313 "evidence": "Table 4 shows multi-hop precision 100% but recall 26.5%, temporal reasoning precision 100% but recall 29.8% for S3.", 314 "supported": "strong" 315 } 316 ], 317 "methodology_tags": [ 318 "case-study", 319 "benchmark-eval" 320 ], 321 "key_findings": "The paper proposes a four-pillar assessment framework (LLM, Memory, Tools, Environment) for evaluating agentic AI systems beyond task completion. Validated on three CloudOps scenarios using the MOYA multi-agent framework with GPT-4o, it finds that standard task-completion metrics miss significant behavioral failures: a scenario with 100% task completion showed only 13.1% memory recall and 33% tool sequence correctness. The framework identifies tool orchestration and memory management as the highest-failure dimensions in complex multi-agent scenarios, and quantifies the cost-accuracy tradeoff between LLM-as-Judge ($0.0593, 14.7s) and Agent-as-Judge ($0.9572, 913.4s) evaluation protocols.", 322 "red_flags": [ 323 { 324 "flag": "Industry co-authorship without conflict-of-interest declaration", 325 "detail": "Two of six authors are MontyCloud Inc. employees, and the framework is validated exclusively on MontyCloud's own production system (MOYA). The paper does not include a competing interests statement, and MontyCloud has a financial interest in the framework appearing effective. The results could be selectively framed to favor MontyCloud's system." 326 }, 327 { 328 "flag": "Very small evaluation: 3 scenarios, 3 runs each", 329 "detail": "The framework is validated on only three scenarios, run three times each (nine total executions). This is an extremely small sample from which to draw conclusions about framework effectiveness. The paper provides no justification for why three scenarios are sufficient." 330 }, 331 { 332 "flag": "Overgeneralized framing vs. narrow validation", 333 "detail": "The paper presents the framework as general for 'evaluating agentic AI systems' and describes it as 'end-to-end', but validates it only on CloudOps with a single LLM (GPT-4o). Results from one domain and one model cannot support claims about general agentic AI evaluation." 334 }, 335 { 336 "flag": "LLM-as-Judge reliability not validated", 337 "detail": "The LLM-as-Judge protocol is used to assess agent performance, but the reliability of this evaluation method (inter-rater agreement, calibration against human judgment) is not validated. The paper notes 'Judge-based protocols introduce potential LLM evaluation biases' in the limitations but does not quantify or address this." 338 }, 339 { 340 "flag": "No variance statistics despite non-deterministic model", 341 "detail": "The paper explicitly motivates the need to handle non-deterministic LLM behavior, yet tables report only averages across three runs with no standard deviations. Wide variability is visible in Figure 3 (e.g., S1 input tokens range 7,784-19,335) but is not numerically reported." 342 } 343 ], 344 "cited_papers": [ 345 { 346 "title": "Are: Scaling up agent environments and evaluations", 347 "authors": [ 348 "Andrews, P.", 349 "Benhalloum, A." 350 ], 351 "year": 2025, 352 "relevance": "Directly relevant as a benchmark/evaluation environment for agentic AI systems, cited for structured workflow evaluation methodology." 353 }, 354 { 355 "title": "AgentHarm: A benchmark for measuring harmfulness of LLM agents", 356 "authors": [ 357 "Andriushchenko, M.", 358 "Souly, A." 359 ], 360 "year": 2024, 361 "arxiv_id": "2410.09024", 362 "relevance": "Benchmark for evaluating safety and harmful behavior in LLM agents, relevant to the survey's interest in agentic AI evaluation methodologies." 363 }, 364 { 365 "title": "Why do multi-agent LLM systems fail?", 366 "authors": [ 367 "Cemri, M.", 368 "Pan, M. Z.", 369 "Yang, S." 370 ], 371 "year": 2025, 372 "arxiv_id": "2503.13657", 373 "relevance": "Analyzes failure modes in multi-agent LLM systems, directly relevant to understanding evaluation challenges in agentic AI." 374 }, 375 { 376 "title": "AIopslab: A holistic framework to evaluate AI agents for enabling autonomous clouds", 377 "authors": [ 378 "Chen, Y." 379 ], 380 "year": 2025, 381 "relevance": "Closely related evaluation framework for AI agents in cloud operations, providing a comparison point for this paper's CloudOps focus." 382 }, 383 { 384 "title": "A survey on LLM-as-a-judge", 385 "authors": [ 386 "Gu, J.", 387 "Jiang, X." 388 ], 389 "year": 2025, 390 "relevance": "Survey of LLM-as-a-judge evaluation methodology, foundational for the judge-based evaluation protocols used in this paper." 391 }, 392 { 393 "title": "An empirical study of testing practices in open source AI agent frameworks and agentic applications", 394 "authors": [ 395 "Hasan, M. M.", 396 "Li, H." 397 ], 398 "year": 2025, 399 "relevance": "Empirical study of testing and evaluation practices in agentic AI frameworks, directly in scope for the survey." 400 }, 401 { 402 "title": "Agentic software engineering: Foundational pillars and a research roadmap", 403 "authors": [ 404 "Hassan, A. E." 405 ], 406 "year": 2025, 407 "relevance": "Research roadmap for agentic software engineering, providing context for the evaluation challenges addressed by this paper." 408 }, 409 { 410 "title": "Engineering LLM powered multi-agent framework for autonomous CloudOps", 411 "authors": [ 412 "Parthasarathy, K.", 413 "Vaidhyanathan, K." 414 ], 415 "year": 2025, 416 "relevance": "Prior work by the same group introducing the MOYA framework used as the evaluation target in this paper." 417 }, 418 { 419 "title": "AgentIF: Benchmarking instruction following of large language models in agentic scenarios", 420 "authors": [ 421 "Qi, Y.", 422 "Peng, H." 423 ], 424 "year": 2025, 425 "arxiv_id": "2505.16944", 426 "relevance": "Benchmark for instruction-following in agentic scenarios, relevant to the LLM pillar evaluation in this framework." 427 }, 428 { 429 "title": "A survey on the memory mechanism of large language model based agents", 430 "authors": [ 431 "Zhang, Z.", 432 "Bo, X." 433 ], 434 "year": 2024, 435 "relevance": "Survey of memory mechanisms in LLM agents, relevant to the Memory pillar evaluation methodology in this paper." 436 }, 437 { 438 "title": "Which agent causes task failures and when? On automated failure attribution of LLM multi-agent systems", 439 "authors": [ 440 "Zhang, S." 441 ], 442 "year": 2025, 443 "relevance": "Automated failure attribution in multi-agent LLM systems, directly relevant to the failure analysis methodology in this paper." 444 }, 445 { 446 "title": "Evaluating very long-term conversational memory of LLM agents", 447 "authors": [ 448 "Maharana, A.", 449 "Lee, D.-H.", 450 "Tulyakov, S.", 451 "Bansal, M.", 452 "Barbieri, F.", 453 "Fang, Y." 454 ], 455 "year": 2024, 456 "arxiv_id": "2402.17753", 457 "relevance": "Evaluates memory in LLM agents, directly relevant to the Memory pillar of the proposed assessment framework." 458 }, 459 { 460 "title": "Evaluating memory in LLM agents via incremental multi-turn interactions", 461 "authors": [ 462 "Hu, Y.", 463 "Wang, Y.", 464 "McAuley, J." 465 ], 466 "year": 2025, 467 "arxiv_id": "2507.05257", 468 "relevance": "Memory evaluation methodology for LLM agents, relevant to the survey's coverage of agentic AI evaluation." 469 } 470 ] 471 }