scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25802B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "From Benchmarks to Business Impact: Deploying IBM Generalist Agent in Enterprise Production",
      6     "authors": [
      7       "Segev Shlomov",
      8       "Alon Oved",
      9       "Sami Marreed",
     10       "Ido Levy",
     11       "Offer Akrabi",
     12       "Avi Yaeli",
     13       "Łukasz Strak",
     14       "Elizabeth Koumpan",
     15       "Yinon Goldshtein",
     16       "Eilam Shapira",
     17       "Nir Mashkif",
     18       "Asaf Adi"
     19     ],
     20     "year": 2025,
     21     "venue": "arXiv.org",
     22     "arxiv_id": "2510.23856",
     23     "doi": "10.48550/arXiv.2510.23856"
     24   },
     25   "checklist": {
     26     "claims_and_evidence": {
     27       "abstract_claims_supported": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "SOTA benchmark claims are supported by Tables 1, 2, 5, 7; business impact claims use appropriately hedged language ('preliminary evaluations', 'indicating potential') throughout the abstract.",
     31         "source": "haiku"
     32       },
     33       "causal_claims_justified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Ablation claims ('reflective retries: -11 points', 'variable tracking: -15 reproducibility') are based on a 26-task benchmark with no statistical testing — differences of ~3–4 tasks are reported as causal without adequate design.",
     37         "source": "haiku"
     38       },
     39       "generalization_bounded": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Section 4 makes broad enterprise readiness claims drawing from informal 'discussions with Finance, Sales, Procurement, Legal' without systematic evidence; a single BPO-TA pilot supports sweeping 'enterprise-ready' conclusions.",
     43         "source": "haiku"
     44       },
     45       "alternative_explanations_discussed": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No alternative explanations are offered for CUGA's benchmark gains — e.g., whether improvements stem from the hierarchical architecture or from using a stronger base LLM (GPT-4.1 vs. GPT-4o for baselines).",
     49         "source": "haiku"
     50       },
     51       "proxy_outcome_distinction": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Benchmark accuracy is used as proxy for enterprise readiness without systematic discussion of the gap; the 90%/50% development savings are projections from simulated workflows but are presented alongside measured results without clear labeling.",
     55         "source": "haiku"
     56       }
     57     },
     58     "limitations_and_scope": {
     59       "limitations_section_present": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No dedicated limitations section exists; Section 7 'Lessons Learned' mentions preliminary nature and simulation constraints but is framed as forward-looking rather than a systematic limitations discussion.",
     63         "source": "haiku"
     64       },
     65       "threats_to_validity_specific": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper acknowledges 'not formally tested for statistical significance' and 'controlled test environments' but names no specific threats such as selection bias, single-domain generalization risk, or small-sample effects.",
     69         "source": "haiku"
     70       },
     71       "scope_boundaries_stated": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No explicit statement bounds generalization (e.g., 'these results do not demonstrate enterprise readiness in other domains'); 'preliminary' qualifiers appear but scope limits are not formally stated.",
     75         "source": "haiku"
     76       }
     77     },
     78     "conflicts_of_interest": {
     79       "funding_disclosed": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No funding statement is present anywhere in the paper; the work is implicitly IBM-funded through employment but no explicit disclosure is made.",
     83         "source": "haiku"
     84       },
     85       "affiliations_disclosed": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "All authors are listed as IBM Research or IBM Consulting employees, clearly disclosed in the author affiliations block.",
     89         "source": "haiku"
     90       },
     91       "funder_independent_of_outcome": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "IBM employees evaluate IBM's own proprietary system (CUGA) deployed in IBM's own BPO business unit — the implicit funder is directly interested in a positive outcome.",
     95         "source": "haiku"
     96       },
     97       "financial_interests_declared": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No competing interests or financial interests statement appears anywhere in the paper.",
    101         "source": "haiku"
    102       }
    103     },
    104     "scope_and_framing": {
    105       "key_terms_defined": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Key terms are reasonably defined: 'generalist agent' is defined as 'single systems designed to perform diverse computer-use tasks,' 'BPO' and 'TA' are explained, and Section 4 enumerates enterprise requirements explicitly.",
    109         "source": "haiku"
    110       },
    111       "intended_contribution_clear": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper explicitly lists five contributions: enterprise pilot experience, BPO-TA benchmark, architectural advances, preliminary business impact, and lessons learned.",
    115         "source": "haiku"
    116       },
    117       "engagement_with_prior_work": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 2 provides detailed related work covering ReAct, CodeAct, AutoGen, LangGraph, WebArena, AppWorld, OSWorld, and governance frameworks, situating CUGA's contributions within the landscape.",
    121         "source": "haiku"
    122       }
    123     }
    124   },
    125   "type_checklist": {
    126     "empirical": {
    127       "artifacts": {
    128         "code_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "The paper states CUGA 'has been open-sourced for the community' with a GitHub link (https://github.com/cuga-project/cuga-agent) in the abstract footnote.",
    132           "source": "haiku"
    133         },
    134         "data_released": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "WebArena and AppWorld are public benchmarks, but the novel BPO-TA benchmark (26 tasks over 13 enterprise APIs) is not publicly released, and the enterprise API data is proprietary.",
    138           "source": "haiku"
    139         },
    140         "environment_specified": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No requirements file, Dockerfile, or dependency specifications are provided; only the LLM backbone (GPT-4.1) is named in the AppWorld appendix table.",
    144           "source": "haiku"
    145         },
    146         "reproduction_instructions": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "No step-by-step reproduction instructions are provided; code is open-sourced but the paper includes no instructions for reproducing benchmark or BPO-TA results.",
    150           "source": "haiku"
    151         }
    152       },
    153       "statistical_methodology": {
    154         "confidence_intervals_or_error_bars": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No confidence intervals or error bars are reported for any result in the paper.",
    158           "source": "haiku"
    159         },
    160         "significance_tests": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "The paper explicitly states results were 'not formally tested for statistical significance (Dror et al. 2018, 2020)'.",
    164           "source": "haiku"
    165         },
    166         "effect_sizes_reported": {
    167           "applies": true,
    168           "answer": true,
    169           "justification": "Effect sizes with baseline context are reported: valid-first-try rate 79% vs. 62% (ReAct), ablation deltas (-11, -15 points), BPO-TA accuracy 87%.",
    170           "source": "haiku"
    171         },
    172         "sample_size_justified": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "The 26-task BPO-TA benchmark size is never justified with power analysis or minimum detectable difference reasoning.",
    176           "source": "haiku"
    177         },
    178         "variance_reported": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "No variance, standard deviation, or run-to-run spread is reported for any metric in the paper.",
    182           "source": "haiku"
    183         }
    184       },
    185       "evaluation_design": {
    186         "baselines_included": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "A vanilla ReAct baseline (62% valid-first-try rate) is included for BPO-TA; leaderboard competitors are listed for WebArena and AppWorld.",
    190           "source": "haiku"
    191         },
    192         "baselines_contemporary": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "WebArena and AppWorld leaderboards include contemporary systems (OpenAI Operator, Jace.AI 2024, GPT-4o-based methods) published in the same period.",
    196           "source": "haiku"
    197         },
    198         "ablation_study": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Ablation results are reported: removing reflective retries costs -11 points, removing variable tracking costs -15 reproducibility points on BPO-TA.",
    202           "source": "haiku"
    203         },
    204         "multiple_metrics": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Multiple metrics are used: task accuracy, valid-first-try rate, average latency, provenance log coverage (95%), analyst-reported reproducibility (4.6/5), scenario/task goal completion.",
    208           "source": "haiku"
    209         },
    210         "human_evaluation": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Analyst-reported reproducibility (4.6/5) and qualitative feedback from BPO architects are included, though informal and not controlled.",
    214           "source": "haiku"
    215         },
    216         "held_out_test_set": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "WebArena and AppWorld use defined held-out test sets; BPO-TA is described as a 'fixed test set' enabling reproducible regression testing.",
    220           "source": "haiku"
    221         },
    222         "per_category_breakdown": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "WebArena results broken down by application (Table 1), AppWorld by difficulty level (Table 2), BPO-TA by task category (Table 8, Figure 7).",
    226           "source": "haiku"
    227         },
    228         "failure_cases_discussed": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Failures are discussed: 'failures concentrated on unsupported cross-application queries where graceful degradation is expected'; BPO-TA includes explicit graceful-failure task categories.",
    232           "source": "haiku"
    233         },
    234         "negative_results_reported": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "The paper is predominantly positive; failure cases are explained away as expected behavior (unsupported queries), and no scenarios where CUGA underperforms relative to expectations are presented.",
    238           "source": "haiku"
    239         }
    240       },
    241       "setup_transparency": {
    242         "model_versions_specified": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "GPT-4.1 is specified only in the AppWorld appendix table (Table 7) but not in the main WebArena results (Table 5) or BPO-TA results (Table 3); key results lack consistent model version disclosure.",
    246           "source": "haiku"
    247         },
    248         "prompts_provided": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "No actual prompts or system instructions are provided; schema-grounded prompting and specification minimization are described conceptually without showing concrete examples.",
    252           "source": "haiku"
    253         },
    254         "hyperparameters_reported": {
    255           "applies": true,
    256           "answer": false,
    257           "justification": "Temperature, top-p, context window sizes, and other LLM hyperparameters are not reported anywhere in the paper.",
    258           "source": "haiku"
    259         },
    260         "scaffolding_described": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "The layered planner-executor architecture is described in substantial detail (Section 5, Appendix B) with specific named components: TaskAnalyzer, TaskDecomposer, PlanController, API/Browser sub-agents, and their interactions.",
    264           "source": "haiku"
    265         },
    266         "data_preprocessing_documented": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "API schema minimization is described conceptually but preprocessing steps (PII redaction criteria, schema canonicalization rules) are not documented with sufficient detail for reproduction.",
    270           "source": "haiku"
    271         }
    272       },
    273       "data_integrity": {
    274         "raw_data_available": {
    275           "applies": true,
    276           "answer": false,
    277           "justification": "Enterprise API data and agent interaction logs are proprietary; the BPO-TA task catalog is in the appendix but actual API responses and raw interaction data are unavailable.",
    278           "source": "haiku"
    279         },
    280         "data_collection_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "The 13 read-only APIs, task design principles (traceability, realism, reproducibility), and 26-task taxonomy are described in Section 6.1 and Appendix E with category examples.",
    284           "source": "haiku"
    285         },
    286         "recruitment_methods_described": {
    287           "applies": false,
    288           "answer": false,
    289           "justification": "No formal participant recruitment; analyst feedback comes from IBM BPO team members as part of their regular pilot workflow, not a structured human subjects study.",
    290           "source": "haiku"
    291         },
    292         "data_pipeline_documented": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "The pipeline (API calls → schema validation → provenance logging) is described conceptually but not documented in sufficient detail to reproduce the data flow.",
    296           "source": "haiku"
    297         }
    298       },
    299       "contamination": {
    300         "training_cutoff_stated": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "GPT-4.1's training data cutoff is never stated; both WebArena (2023) and AppWorld (2024) are public benchmarks potentially present in GPT-4.1's training data.",
    304           "source": "haiku"
    305         },
    306         "train_test_overlap_discussed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "No discussion of whether GPT-4.1 may have been trained on WebArena or AppWorld tasks, both of which were publicly released before GPT-4.1 itself.",
    310           "source": "haiku"
    311         },
    312         "benchmark_contamination_addressed": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "WebArena (2023) and AppWorld (2024) are public and could be in GPT-4.1's pretraining data; this potential contamination is not acknowledged or addressed.",
    316           "source": "haiku"
    317         }
    318       },
    319       "human_studies": {
    320         "pre_registered": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No formal human subjects study; analyst feedback is incidental to the enterprise pilot deployment.",
    324           "source": "haiku"
    325         },
    326         "irb_or_ethics_approval": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No formal human subjects study requiring ethics review.",
    330           "source": "haiku"
    331         },
    332         "demographics_reported": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No formal human subjects study; analyst participants are not described demographically.",
    336           "source": "haiku"
    337         },
    338         "inclusion_exclusion_criteria": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No formal participant selection criteria; IBM BPO team members participated as part of their work duties.",
    342           "source": "haiku"
    343         },
    344         "randomization_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No experimental human study with randomization.",
    348           "source": "haiku"
    349         },
    350         "blinding_described": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No blinding in this non-experimental pilot study.",
    354           "source": "haiku"
    355         },
    356         "attrition_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "No formal human subjects study with attrition to report.",
    360           "source": "haiku"
    361         }
    362       },
    363       "cost_and_practicality": {
    364         "inference_cost_reported": {
    365           "applies": true,
    366           "answer": true,
    367           "justification": "Average latency per query is reported (11.2s, Table 3); latency is a direct practical cost metric for enterprise deployment.",
    368           "source": "haiku"
    369         },
    370         "compute_budget_stated": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "No total compute budget, token usage, or monetary cost is stated for running the evaluations or the pilot.",
    374           "source": "haiku"
    375         }
    376       }
    377     }
    378   },
    379   "claims": [
    380     {
    381       "claim": "CUGA achieves state-of-the-art on WebArena with 61.7% accuracy, surpassing OpenAI Operator (58.1%)",
    382       "evidence": "Table 5 leaderboard comparison against published competitors; per-application breakdown in Table 1",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "CUGA achieves state-of-the-art on AppWorld Test-Challenge with 57.6% task goal completion and 48.2% scenario goal completion using GPT-4.1",
    387       "evidence": "Table 7 shows CUGA at 73.2/57.6 TGC (Test-Normal/Test-Challenge) vs. next best Chen et al. at 72.6/47.2; model specified",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "CUGA achieves 87% accuracy on BPO-TA benchmark, approaching specialized agent performance",
    392       "evidence": "Table 3 reports 87% task accuracy on 26-task BPO-TA benchmark with no error bars or comparison to a specialized-agent ceiling",
    393       "supported": "weak"
    394     },
    395     {
    396       "claim": "Generalist agents can reduce enterprise development time by up to 90% and development cost by up to 50% versus task-specific baselines",
    397       "evidence": "Section 7 describes these as 'internal projections and controlled simulations,' not empirically measured outcomes from a controlled study",
    398       "supported": "unsupported"
    399     },
    400     {
    401       "claim": "CUGA reduces average time-to-answer from ~20 minutes (manual) to 2–5 minutes",
    402       "evidence": "Table 4 presents this as a 'preliminary pilot evaluation' from 'controlled test environments and limited analyst feedback,' not production measurement",
    403       "supported": "weak"
    404     },
    405     {
    406       "claim": "Valid-first-try rate improved from 62% (vanilla ReAct baseline) to 79% with full CUGA on BPO-TA",
    407       "evidence": "Reported in Section 6.1 based on 26-task benchmark; no statistical testing or error bars",
    408       "supported": "moderate"
    409     },
    410     {
    411       "claim": "Reflective retries and variable tracking are causally responsible for -11 and -15 point drops respectively when removed",
    412       "evidence": "Ablation study on 26-task BPO-TA benchmark; differences represent ~3–4 tasks with no statistical significance testing",
    413       "supported": "weak"
    414     }
    415   ],
    416   "methodology_tags": [
    417     "benchmark-eval",
    418     "case-study",
    419     "observational"
    420   ],
    421   "key_findings": "CUGA, IBM's hierarchical planner-executor generalist agent, achieves state-of-the-art performance on WebArena (61.7%) and AppWorld Test-Challenge (48.2% scenario completion), validating its design against contemporary specialized systems. In a preliminary enterprise pilot in BPO talent acquisition, CUGA reached 87% accuracy on a 26-task internal benchmark (BPO-TA) with 11.2s average latency and 95% provenance log coverage, while qualitative analyst feedback was positive. Business impact claims (90% development time reduction, 50% cost reduction, 20-min-to-2–5-min time-to-answer) are derived from internal projections and simulated workflows rather than measured production outcomes, and no statistical significance testing was conducted for any result.",
    422   "red_flags": [
    423     {
    424       "flag": "Self-evaluation bias",
    425       "detail": "IBM employees evaluate IBM's own proprietary system (CUGA) in IBM's own business unit with no independent third-party evaluation."
    426     },
    427     {
    428       "flag": "Business impact figures are projections, not measurements",
    429       "detail": "The 90% development time reduction and 50% cost reduction are described as 'internal projections and controlled simulations' but are prominently featured as contributions alongside measured results."
    430     },
    431     {
    432       "flag": "26-task benchmark insufficient for statistical conclusions",
    433       "detail": "BPO-TA has only 26 tasks; ablation deltas of -11/-15 points represent ~3–4 task differences with no statistical significance testing."
    434     },
    435     {
    436       "flag": "No statistical significance testing (self-acknowledged)",
    437       "detail": "Explicitly acknowledged: 'not formally tested for statistical significance.' All comparative and ablation claims lack statistical rigor."
    438     },
    439     {
    440       "flag": "Single-domain pilot generalized to enterprise readiness",
    441       "detail": "Enterprise readiness conclusions are drawn from one domain (BPO talent acquisition) selected specifically because it matched CUGA's strengths (read-only APIs, structured analytics queries)."
    442     },
    443     {
    444       "flag": "Benchmark contamination not addressed",
    445       "detail": "WebArena (2023) and AppWorld (2024) are public benchmarks and may be present in GPT-4.1's training data; this is neither acknowledged nor discussed."
    446     },
    447     {
    448       "flag": "No variance reported for any metric",
    449       "detail": "No standard deviation, confidence interval, or run-to-run spread is provided for any result, including the key 87% BPO-TA accuracy figure."
    450     }
    451   ],
    452   "cited_papers": [
    453     {
    454       "title": "AppWorld: A Controllable World of Apps and People for Benchmarking Interactive Coding Agents",
    455       "relevance": "Primary benchmark demonstrating CUGA's SOTA performance on multi-application API orchestration tasks"
    456     },
    457     {
    458       "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents",
    459       "relevance": "Primary benchmark demonstrating CUGA's SOTA web agent performance"
    460     },
    461     {
    462       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    463       "relevance": "Baseline architecture compared against; described as the common starting point for enterprise agent prototypes that hit scaling limits"
    464     },
    465     {
    466       "title": "ST-WebAgentBench: A Benchmark for Evaluating Safety and Trustworthiness in Web Agents",
    467       "relevance": "Related benchmark by same group emphasizing policy adherence and Completion-under-Policy metric for web agents"
    468     },
    469     {
    470       "title": "Towards Enterprise-Ready Computer Using Generalist Agent",
    471       "relevance": "Companion paper (Marreed et al. 2025) describing the CUGA hierarchical architecture in more detail"
    472     },
    473     {
    474       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    475       "relevance": "Related work on reflective retries and verbal self-correction in agents, a key mechanism in CUGA"
    476     },
    477     {
    478       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations",
    479       "relevance": "Related multi-agent orchestration framework positioned alongside CUGA in the enterprise agent landscape"
    480     },
    481     {
    482       "title": "The BrowserGym Ecosystem for Web Agent Research",
    483       "relevance": "Related evaluation platform for web agents under controlled variability, part of the benchmark ecosystem CUGA operates in"
    484     }
    485   ],
    486   "engagement_factors": {
    487     "practical_relevance": {
    488       "score": 3,
    489       "justification": "Directly addresses the enterprise deployment gap with architectural patterns, a domain-specific benchmark, and real pilot experience at IBM BPO scale."
    490     },
    491     "surprise_contrarian": {
    492       "score": 1,
    493       "justification": "The central thesis (generalist agents can work in enterprise settings) aligns with industry trends and is not surprising or counterintuitive."
    494     },
    495     "fear_safety": {
    496       "score": 1,
    497       "justification": "Discusses governance, HITL, and safety requirements for enterprise agents but in a reassuring, problem-solved framing rather than raising concerns."
    498     },
    499     "drama_conflict": {
    500       "score": 1,
    501       "justification": "Implicitly critiques fragmented specialized agent frameworks but does not engage in direct controversy or conflict with other researchers."
    502     },
    503     "demo_ability": {
    504       "score": 2,
    505       "justification": "Code is open-sourced on GitHub and WebArena/AppWorld are reproducible public benchmarks, though the BPO-TA pilot requires proprietary enterprise setup."
    506     },
    507     "brand_recognition": {
    508       "score": 2,
    509       "justification": "IBM is a recognized enterprise brand and IBM Research lends institutional credibility, though IBM is not a top-tier AI research lab in 2025."
    510     }
    511   },
    512   "hn_data": {
    513     "threads": [],
    514     "top_points": 0,
    515     "total_points": 0,
    516     "total_comments": 0
    517   }
    518 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs