scan.json (25636B)
1 { 2 "paper": { 3 "title": "From Benchmarks to Business Impact: Deploying IBM Generalist Agent in Enterprise Production", 4 "authors": ["Segev Shlomov", "Alon Oved", "Sami Marreed", "Ido Levy", "Offer Akrabi", "Avi Yaeli", "Łukasz Strak", "Elizabeth Koumpan", "Yinon Goldshtein", "Eilam Shapira", "Nir Mashkif", "Asaf Adi"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2510.23856", 8 "doi": "10.48550/arXiv.2510.23856" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval", "case-study"], 13 "key_findings": "CUGA achieves state-of-the-art on WebArena (61.7%) and AppWorld Test-Challenge (48.2% scenario completion). In a BPO talent acquisition pilot with 26 tasks across 13 APIs, CUGA reached 87% accuracy. The authors report preliminary estimates of ~90% reduction in development time and ~50% cost reduction compared to specialized agents, though these are based on controlled simulations without statistical significance testing.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The abstract states CUGA has been open-sourced with a GitHub link: https://github.com/cuga-project/cuga-agent (footnote 1)." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The BPO-TA benchmark is described in detail (26 tasks, 13 APIs) but no download link or public release is provided. The benchmark data itself is not made available." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications, requirements files, or dependency details are provided in the paper." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided. The paper describes the architecture but does not give instructions for replicating benchmark results." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results are point estimates (e.g., 61.7% WebArena, 87% BPO-TA) with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper explicitly acknowledges results are 'not formally tested for statistical significance (Dror et al. 2018, 2020)' in Section 6.2." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports effect sizes with baseline context: '~90% improvement' in time-to-answer (from ~20 min to ~2-5 min), reproducibility improvement from ~60% to ~95%, and provenance coverage from ~40% to ~92% (Table 4)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for why 26 BPO-TA tasks or the specific benchmark sizes are adequate. No power analysis." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures reported for any results. Results appear to be single-run; the number of runs is not stated." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "WebArena and AppWorld results compared against published agents (Table 5: Operator, Jace.AI, ScribeAgent, etc.; Table 7: Chen et al., Gupta et al., ReAct). BPO-TA compared against 'vanilla ReAct baseline' (62% vs 79% valid-first-try)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Baselines include recent systems: OpenAI Operator (2025), Jace.AI (2024), ScribeAgent (2024), and concurrent AppWorld entries." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 6.1 mentions ablations: 'reflective retries (-11 points without) and variable tracking (-15 reproducibility without).' However, only two ablation results are reported with minimal detail." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics used: Task Goal Completion and Scenario Goal Completion for AppWorld; accuracy, valid-first-try rate, provenance log rate, latency, and analyst-reported reproducibility for BPO-TA (Table 3)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "Section 6.3 reports qualitative feedback from BPO architects and analysts. Table 3 includes 'Analyst-Reported Reproducibility' score of 4.6/5. However, this is informal feedback, not a structured user study." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "WebArena and AppWorld are established benchmarks with fixed test sets. AppWorld distinguishes Test-Normal and Test-Challenge." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "WebArena results broken down per application (Table 1: GitLab, Map, Reddit, Shopping, etc.). AppWorld broken down by difficulty level (Table 2). BPO-TA task categories described (Section 6.1)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 6.1 notes 'failures concentrated on unsupported cross-application queries where graceful degradation is expected.' The BPO-TA benchmark explicitly includes graceful failure tasks." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Multi-App performance on WebArena is notably low (35.4% vs 61.7% overall). 
The paper acknowledges the system is 'still on its journey toward full production deployment' and discusses limitations of early architectures (Section 3.1)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims are hedged: 'approached the accuracy of specialized agents,' 'indicating potential,' 'preliminary evaluations.' Results in Tables 1-4 support these hedged claims." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The ablation claims ('reflective retries -11 points,' 'variable tracking -15 reproducibility') are causal but reported without detail on experimental design. The '90% reduction in development time' is stated as an estimate without causal methodology." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper is careful to bound claims: 'preliminary evaluations,' 'pilot-level results,' 'controlled test environments and limited analyst feedback rather than full production deployment' (Section 6.2). Title says 'Enterprise Production' but body consistently hedges." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No discussion of alternative explanations for the results. The improvement over ReAct baseline could be due to many factors (more compute, better prompts, etc.) but this is not explored." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper measures task accuracy on BPO-TA and frames it as 'enterprise readiness' and 'business value' without discussing the gap between benchmark accuracy on 26 curated tasks and actual business impact in production." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "AppWorld results use 'GPT-4.1' (Table 7) but no snapshot date or API version. WebArena model not specified in the paper. No version details for the LLMs used in CUGA's various sub-agents." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper describes 'schema-grounded prompting' and prompt design principles but does not provide actual prompt text used in experiments." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No hyperparameters reported: no temperature, top-p, max tokens, or other LLM settings mentioned anywhere." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The architecture is described in detail: hierarchical planner-executor loops, chat layer, outer/inner loops, API Sub Agent, Browser Sub Agent, Plan Controller, ShortlisterAgent, CodeAgent, etc. (Section 5, Figures 2-4, Appendix A-B)." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "The BPO-TA benchmark is described at a high level (task categories, API endpoints) but the data preprocessing steps, how tasks were curated, and how gold-standard answers were created are not documented." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "No dedicated limitations section. Some limitations are scattered in Section 7 (Lessons Learned) and Section 6.2, but there is no substantive 'Limitations' or 'Threats to Validity' section." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No specific threats to validity discussed. 
The paper acknowledges results are 'preliminary' and 'pilot-level' but does not discuss specific threats like selection bias in BPO-TA task design, single-domain evaluation, or potential measurement artifacts." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper explicitly states scope: 'still on its journey toward full production deployment,' 'controlled test environments and limited analyst feedback rather than full production deployment' (Section 6.2), and the pilot is restricted to read-only API access. Section 7 outlines 'next steps' implying what was NOT done." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw data available. BPO-TA benchmark data, detailed per-task results, and analyst feedback data are not released." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "BPO-TA benchmark design is described: 26 tasks across 13 read-only APIs, task categories (lookup, join, looped reasoning, provenance, graceful failure), drawn from analyst practice (Section 6.1, Table 9)." 192 }, 193 "recruitment_methods_described": { 194 "applies": true, 195 "answer": false, 196 "justification": "The paper mentions 'recruiters and analysts' provided feedback but does not describe how many, how they were selected, or the feedback collection process." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "No documentation of the pipeline from task creation to benchmark evaluation. How gold-standard answers were generated, how accuracy was scored, and how the 26 tasks were selected from possible tasks are not described." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source or acknowledgments section. 
All authors are IBM Research or IBM Consulting employees, but no explicit funding disclosure." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All authors clearly listed as IBM Research or IBM Consulting with institutional emails." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "IBM employees are evaluating IBM's own CUGA system on IBM's own BPO business. IBM has a direct financial interest in showing CUGA works for enterprise deployment." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement. IBM authors evaluating IBM's product with potential commercial implications — this conflict is not explicitly acknowledged." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff stated for GPT-4.1 or any models used in CUGA's pipeline." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether WebArena or AppWorld tasks could appear in the training data of the underlying LLMs." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "WebArena was published in 2023 and AppWorld in 2024. Models trained after these dates may have seen benchmark content. This contamination risk is not addressed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "The qualitative feedback from analysts is informal and does not constitute a human subjects study requiring pre-registration." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No formal human subjects study conducted. Informal analyst feedback does not require IRB." 
253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No formal human participants study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No formal human participants study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No formal human participants study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No formal human participants study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No formal human participants study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Table 3 reports 'Average Latency per Query: 11.2s' for BPO-TA. Table 2 reports average interactions per task level for AppWorld. However, no monetary cost is reported." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total compute budget stated — no GPU hours, API costs, or total spend for running the benchmarks or pilot." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of multiple random seeds. All results appear to be single-run." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "Number of experimental runs not stated for any benchmark evaluation." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget reported despite the system having many configurable components." 
307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "No discussion of how the final system configuration was selected or how many configurations were tried." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical tests performed, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "IBM authors evaluate their own CUGA system against baselines without acknowledging self-evaluation bias. Lucic et al. (2018) concern applies directly." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "No comparison of compute budgets between CUGA and baselines. CUGA's hierarchical multi-agent architecture likely uses substantially more compute than simpler baselines, but this is not quantified or discussed." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether WebArena and AppWorld actually measure enterprise readiness. The paper argues enterprise deployment requires different evaluation but does not question the construct validity of the academic benchmarks it uses for SOTA claims." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": false, 336 "justification": "CUGA is a complex scaffold compared to baselines (ReAct, etc.). The paper does not separate model capability from scaffold contribution. Comparisons in Tables 5 and 7 mix different scaffolds." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of temporal leakage. WebArena (2023) and AppWorld (2024) existed before model training — potential solutions could be in training data." 
344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup provides information not available in real deployment." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of independence between training data and benchmark content." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods applied." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "CUGA achieves state-of-the-art performance on WebArena with 61.7% accuracy", 365 "evidence": "Table 1 and Table 5 show per-application breakdown and leaderboard comparison. CUGA leads with 61.7% vs Operator at 58.1%.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "CUGA achieves state-of-the-art on AppWorld Test-Challenge with 48.2% scenario goal completion", 370 "evidence": "Table 2 and Table 7 show per-level results. CUGA leads with 57.6% TGC / 48.2% SGC on Challenge vs next best Chen et al. at 47.2% / 28.8%.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "CUGA achieves 87% accuracy on the BPO-TA benchmark (26 tasks)", 375 "evidence": "Table 3 reports 87% task accuracy. However, this is on a 26-task benchmark designed by the same team, with no external validation.", 376 "supported": "weak" 377 }, 378 { 379 "claim": "CUGA can reduce development time by up to 90% and cost by up to 50% compared to specialized agents", 380 "evidence": "Section 7 and Table 4 present these as 'estimated benefits' from 'controlled simulations' and 'internal projections.' 
The paper explicitly notes these are 'not formally tested for statistical significance.'", 381 "supported": "weak" 382 }, 383 { 384 "claim": "Reflective retries improve accuracy by 11 points and variable tracking improves reproducibility by 15 points", 385 "evidence": "Section 6.1 mentions ablation results but provides no details on methodology, base rates, or statistical testing.", 386 "supported": "weak" 387 }, 388 { 389 "claim": "Generalist agents can enable measurable business value in enterprise contexts", 390 "evidence": "Based on a pre-deployment pilot with read-only access and 26 curated tasks. The system is 'under consideration for production rollout' (Section 3.2), not yet deployed.", 391 "supported": "weak" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "Company evaluating its own product", 397 "detail": "All authors are IBM employees evaluating IBM's CUGA system on IBM's BPO business. The BPO-TA benchmark was designed by the same team. No independent evaluation or third-party validation." 398 }, 399 { 400 "flag": "Claims exceed evidence", 401 "detail": "The title says 'Enterprise Production' and 'Business Impact' but the system is in pre-deployment pilot with 26 curated read-only tasks. The '90% time reduction' and '50% cost reduction' are projections from simulations, not measured production outcomes." 402 }, 403 { 404 "flag": "No statistical rigor", 405 "detail": "Paper explicitly acknowledges results are 'not formally tested for statistical significance' yet makes comparative claims. No error bars, no variance, no multiple runs reported." 406 }, 407 { 408 "flag": "Self-designed benchmark", 409 "detail": "The BPO-TA benchmark (26 tasks) was created by the same team that built the system being evaluated. Task selection, gold-standard answers, and evaluation criteria are all internal with no external review." 
410 }, 411 { 412 "flag": "Benchmark SOTA claims without contamination analysis", 413 "detail": "WebArena (2023) and AppWorld (2024) were published before model training cutoffs. No contamination analysis despite claiming state-of-the-art." 414 }, 415 { 416 "flag": "Tiny benchmark for business impact claims", 417 "detail": "26 tasks is very small for claims about enterprise readiness and business impact. The qualitative evidence is from unstructured feedback with no reported methodology." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "React: Synergizing reasoning and acting in language models", 423 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik R. Narasimhan", "Yuan Cao"], 424 "year": 2022, 425 "relevance": "Foundational agentic paradigm combining chain-of-thought reasoning with environment actions." 426 }, 427 { 428 "title": "Executable code actions elicit better llm agents", 429 "authors": ["Xingyao Wang"], 430 "year": 2024, 431 "relevance": "CodeAct approach for code-centric agent execution, relevant to agentic code generation." 432 }, 433 { 434 "title": "Autogen: Enabling next-gen LLM applications via multi-agent conversations", 435 "authors": ["Qingyun Wu"], 436 "year": 2024, 437 "relevance": "Multi-agent framework for orchestrating agent conversations, directly compared architecture pattern." 438 }, 439 { 440 "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents", 441 "authors": ["Shuyan Zhou"], 442 "year": 2023, 443 "relevance": "Primary benchmark used to evaluate CUGA's web agent capabilities." 444 }, 445 { 446 "title": "AppWorld: A Controllable World of Apps and People for Benchmarking Interactive Coding Agents", 447 "authors": ["Harsh Trivedi"], 448 "year": 2024, 449 "relevance": "Primary benchmark for multi-application API agent evaluation." 
450 }, 451 { 452 "title": "Reflexion: Language agents with verbal reinforcement learning", 453 "authors": ["Noah Shinn"], 454 "year": 2023, 455 "relevance": "Reflective retry mechanism used in CUGA's architecture." 456 }, 457 { 458 "title": "St-webagentbench: A benchmark for evaluating safety and trustworthiness in web agents", 459 "authors": ["Ido Levy"], 460 "year": 2024, 461 "relevance": "Safety/trustworthiness benchmark for web agents, from same research group." 462 }, 463 { 464 "title": "Tau-bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains", 465 "authors": ["Shunyu Yao"], 466 "year": 2025, 467 "relevance": "Benchmark for tool-agent-user dynamics and policy/instruction following." 468 }, 469 { 470 "title": "Magentic-one: A generalist multi-agent system for solving complex tasks", 471 "authors": ["Adam Fourney"], 472 "year": 2024, 473 "arxiv_id": "2411.04468", 474 "relevance": "Competing generalist multi-agent system for complex task solving." 475 }, 476 { 477 "title": "The hitchhiker's guide to testing statistical significance in natural language processing", 478 "authors": ["Rotem Dror", "Gili Baumer", "Segev Shlomov", "Roi Reichart"], 479 "year": 2018, 480 "relevance": "Statistical testing methodology for NLP — cited by authors as the standard they did NOT follow." 481 }, 482 { 483 "title": "Towards enterprise-ready computer using generalist agent", 484 "authors": ["Sami Marreed"], 485 "year": 2025, 486 "arxiv_id": "2503.01861", 487 "relevance": "Companion paper describing CUGA's hierarchical architecture in detail." 488 }, 489 { 490 "title": "From grounding to planning: Benchmarking bottlenecks in web agents", 491 "authors": ["Segev Shlomov"], 492 "year": 2024, 493 "arxiv_id": "2409.01927", 494 "relevance": "Identifies planning as dominant bottleneck in web agents, motivating CUGA's planner-executor design." 495 } 496 ] 497 }