scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (30395B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "From Benchmarks to Business Impact: Deploying IBM Generalist Agent in Enterprise Production",
      6     "authors": [
      7       "Segev Shlomov",
      8       "Alon Oved",
      9       "Sami Marreed",
     10       "Ido Levy",
     11       "Offer Akrabi",
     12       "Avi Yaeli",
     13       "Łukasz Strak",
     14       "Elizabeth Koumpan",
     15       "Yinon Goldshtein",
     16       "Eilam Shapira",
     17       "Nir Mashkif",
     18       "Asaf Adi"
     19     ],
     20     "year": 2025,
     21     "venue": "arXiv.org",
     22     "arxiv_id": "2510.23856",
     23     "doi": "10.48550/arXiv.2510.23856"
     24   },
     25   "checklist": {
     26     "claims_and_evidence": {
     27       "abstract_claims_supported": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Abstract claims are hedged: 'approached the accuracy of specialized agents,' 'indicating potential,' 'preliminary evaluations.' Results in Tables 1-4 support these hedged claims.",
     31         "source": "opus"
     32       },
     33       "causal_claims_justified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The ablation claims ('reflective retries -11 points,' 'variable tracking -15 reproducibility') are causal but reported without detail on experimental design. The '90% reduction in development time' is stated as an estimate without causal methodology.",
     37         "source": "opus"
     38       },
     39       "generalization_bounded": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "The paper is careful to bound claims: 'preliminary evaluations,' 'pilot-level results,' 'controlled test environments and limited analyst feedback rather than full production deployment' (Section 6.2). Title says 'Enterprise Production' but body consistently hedges.",
     43         "source": "opus"
     44       },
     45       "alternative_explanations_discussed": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "No discussion of alternative explanations for the results. The improvement over ReAct baseline could be due to many factors (more compute, better prompts, etc.) but this is not explored.",
     49         "source": "opus"
     50       },
     51       "proxy_outcome_distinction": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper measures task accuracy on BPO-TA and frames it as 'enterprise readiness' and 'business value' without discussing the gap between benchmark accuracy on 26 curated tasks and actual business impact in production.",
     55         "source": "opus"
     56       }
     57     },
     58     "limitations_and_scope": {
     59       "limitations_section_present": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No dedicated limitations section. Some limitations are scattered in Section 7 (Lessons Learned) and Section 6.2, but there is no substantive 'Limitations' or 'Threats to Validity' section.",
     63         "source": "opus"
     64       },
     65       "threats_to_validity_specific": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No specific threats to validity discussed. The paper acknowledges results are 'preliminary' and 'pilot-level' but does not discuss specific threats like selection bias in BPO-TA task design, single-domain evaluation, or potential measurement artifacts.",
     69         "source": "opus"
     70       },
     71       "scope_boundaries_stated": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper explicitly states scope: 'still on its journey toward full production deployment,' 'controlled test environments and limited analyst feedback rather than full production deployment' (Section 6.2), and the pilot is restricted to read-only APIs. Section 7 outlines 'next steps,' implying what was NOT done.",
     75         "source": "opus"
     76       }
     77     },
     78     "conflicts_of_interest": {
     79       "funding_disclosed": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No funding source or acknowledgments section. All authors are IBM Research or IBM Consulting employees, but no explicit funding disclosure.",
     83         "source": "opus"
     84       },
     85       "affiliations_disclosed": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "All authors clearly listed as IBM Research or IBM Consulting with institutional emails.",
     89         "source": "opus"
     90       },
     91       "funder_independent_of_outcome": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "IBM employees are evaluating IBM's own CUGA system on IBM's own BPO business. IBM has a direct financial interest in showing CUGA works for enterprise deployment.",
     95         "source": "opus"
     96       },
     97       "financial_interests_declared": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No competing interests statement. IBM authors evaluating IBM's product with potential commercial implications — this conflict is not explicitly acknowledged.",
    101         "source": "opus"
    102       }
    103     },
    104     "scope_and_framing": {
    105       "key_terms_defined": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Key terms including 'generalist agent,' 'Computer Using Agent (CUA),' 'HITL,' and 'planner-executor architecture' are defined or described within context. The distinction between generalist and specialized agents is explained in Section 4.",
    109         "source": "haiku"
    110       },
    111       "intended_contribution_clear": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 1 explicitly enumerates five distinct contributions: enterprise pilot experience, domain benchmark (BPO-TA), architectural advances, preliminary business impact metrics, and lessons learned.",
    115         "source": "haiku"
    116       },
    117       "engagement_with_prior_work": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 2 engages substantively with prior work including ReAct, CodeAct, AutoGen, LangGraph, WebArena, AppWorld, and related enterprise AI systems, situating CUGA's contributions relative to each category.",
    121         "source": "haiku"
    122       }
    123     }
    124   },
    125   "type_checklist": {
    126     "empirical": {
    127       "artifacts": {
    128         "code_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "The abstract states CUGA has been open-sourced with a GitHub link: https://github.com/cuga-project/cuga-agent (footnote 1).",
    132           "source": "opus"
    133         },
    134         "data_released": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The BPO-TA benchmark is described in detail (26 tasks, 13 APIs) but no download link or public release is provided. The benchmark data itself is not made available.",
    138           "source": "opus"
    139         },
    140         "environment_specified": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No environment specifications, requirements files, or dependency details are provided in the paper.",
    144           "source": "opus"
    145         },
    146         "reproduction_instructions": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "No step-by-step reproduction instructions are provided. The paper describes the architecture but does not give instructions for replicating benchmark results.",
    150           "source": "opus"
    151         }
    152       },
    153       "statistical_methodology": {
    154         "confidence_intervals_or_error_bars": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "All results are point estimates (e.g., 61.7% WebArena, 87% BPO-TA) with no confidence intervals or error bars.",
    158           "source": "opus"
    159         },
    160         "significance_tests": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "The paper explicitly acknowledges results are 'not formally tested for statistical significance (Dror et al. 2018, 2020)' in Section 6.2.",
    164           "source": "opus"
    165         },
    166         "effect_sizes_reported": {
    167           "applies": true,
    168           "answer": true,
    169           "justification": "The paper reports effect sizes with baseline context: '~90% improvement' in time-to-answer (from ~20 min to ~2-5 min), reproducibility improvement from ~60% to ~95%, and provenance coverage from ~40% to ~92% (Table 4).",
    170           "source": "opus"
    171         },
    172         "sample_size_justified": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No justification for why 26 BPO-TA tasks or the specific benchmark sizes are adequate. No power analysis.",
    176           "source": "opus"
    177         },
    178         "variance_reported": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "No variance, standard deviation, or spread measures reported for any results. Single-run numbers throughout.",
    182           "source": "opus"
    183         }
    184       },
    185       "evaluation_design": {
    186         "baselines_included": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "WebArena and AppWorld results compared against published agents (Table 5: Operator, Jace.AI, ScribeAgent, etc.; Table 7: Chen et al., Gupta et al., ReAct). BPO-TA compared against 'vanilla ReAct baseline' (62% vs 79% valid-first-try).",
    190           "source": "opus"
    191         },
    192         "baselines_contemporary": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Baselines include recent systems: OpenAI Operator (2025), Jace.AI (2024), ScribeAgent (2024), and concurrent AppWorld entries.",
    196           "source": "opus"
    197         },
    198         "ablation_study": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Section 6.1 mentions ablations: 'reflective retries (-11 points without) and variable tracking (-15 reproducibility without).' However, only two ablation results are reported with minimal detail.",
    202           "source": "opus"
    203         },
    204         "multiple_metrics": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Multiple metrics used: Task Goal Completion and Scenario Goal Completion for AppWorld; accuracy, valid-first-try rate, provenance log rate, latency, and analyst-reported reproducibility for BPO-TA (Table 3).",
    208           "source": "opus"
    209         },
    210         "human_evaluation": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Section 6.3 reports qualitative feedback from BPO architects and analysts. Table 3 includes 'Analyst-Reported Reproducibility' score of 4.6/5. However, this is informal feedback, not a structured user study.",
    214           "source": "opus"
    215         },
    216         "held_out_test_set": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "WebArena and AppWorld are established benchmarks with fixed test sets. AppWorld distinguishes Test-Normal and Test-Challenge.",
    220           "source": "opus"
    221         },
    222         "per_category_breakdown": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "WebArena results broken down per application (Table 1: GitLab, Map, Reddit, Shopping, etc.). AppWorld broken down by difficulty level (Table 2). BPO-TA task categories described (Section 6.1).",
    226           "source": "opus"
    227         },
    228         "failure_cases_discussed": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Section 6.1 notes 'failures concentrated on unsupported cross-application queries where graceful degradation is expected.' The BPO-TA benchmark explicitly includes graceful failure tasks.",
    232           "source": "opus"
    233         },
    234         "negative_results_reported": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Multi-App performance on WebArena is notably low (35.4% vs 61.7% overall). The paper acknowledges the system is 'still on its journey toward full production deployment' and discusses limitations of early architectures (Section 3.1).",
    238           "source": "opus"
    239         }
    240       },
    241       "setup_transparency": {
    242         "model_versions_specified": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "AppWorld results use 'GPT-4.1' (Table 7) but no snapshot date or API version. WebArena model not specified in the paper. No version details for the LLMs used in CUGA's various sub-agents.",
    246           "source": "opus"
    247         },
    248         "prompts_provided": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "The paper describes 'schema-grounded prompting' and prompt design principles but does not provide actual prompt text used in experiments.",
    252           "source": "opus"
    253         },
    254         "hyperparameters_reported": {
    255           "applies": true,
    256           "answer": false,
    257           "justification": "No hyperparameters reported: no temperature, top-p, max tokens, or other LLM settings mentioned anywhere.",
    258           "source": "opus"
    259         },
    260         "scaffolding_described": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "The architecture is described in detail: hierarchical planner-executor loops, chat layer, outer/inner loops, API Sub Agent, Browser Sub Agent, Plan Controller, ShortlisterAgent, CodeAgent, etc. (Section 5, Figures 2-4, Appendix A-B).",
    264           "source": "opus"
    265         },
    266         "data_preprocessing_documented": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "The BPO-TA benchmark is described at a high level (task categories, API endpoints) but the data preprocessing steps, how tasks were curated, and how gold-standard answers were created are not documented.",
    270           "source": "opus"
    271         }
    272       },
    273       "data_integrity": {
    274         "raw_data_available": {
    275           "applies": true,
    276           "answer": false,
    277           "justification": "No raw data available. BPO-TA benchmark data, detailed per-task results, and analyst feedback data are not released.",
    278           "source": "opus"
    279         },
    280         "data_collection_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "BPO-TA benchmark design is described: 26 tasks across 13 read-only APIs, task categories (lookup, join, looped reasoning, provenance, graceful failure), drawn from analyst practice (Section 6.1, Table 9).",
    284           "source": "opus"
    285         },
    286         "recruitment_methods_described": {
    287           "applies": true,
    288           "answer": false,
    289           "justification": "The paper mentions 'recruiters and analysts' provided feedback but does not describe how many, how they were selected, or the feedback collection process.",
    290           "source": "opus"
    291         },
    292         "data_pipeline_documented": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "No documentation of the pipeline from task creation to benchmark evaluation. How gold-standard answers were generated, how accuracy was scored, and how the 26 tasks were selected from possible tasks is not described.",
    296           "source": "opus"
    297         }
    298       },
    299       "contamination": {
    300         "training_cutoff_stated": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "No training data cutoff stated for GPT-4.1 or any models used in CUGA's pipeline.",
    304           "source": "opus"
    305         },
    306         "train_test_overlap_discussed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "No discussion of whether WebArena or AppWorld tasks could appear in the training data of the underlying LLMs.",
    310           "source": "opus"
    311         },
    312         "benchmark_contamination_addressed": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "WebArena was published in 2023 and AppWorld in 2024. Models trained after these dates may have seen benchmark content. This contamination risk is not addressed.",
    316           "source": "opus"
    317         }
    318       },
    319       "human_studies": {
    320         "pre_registered": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "The qualitative feedback from analysts is informal and does not constitute a human subjects study requiring pre-registration.",
    324           "source": "opus"
    325         },
    326         "irb_or_ethics_approval": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No formal human subjects study conducted. Informal analyst feedback does not require IRB.",
    330           "source": "opus"
    331         },
    332         "demographics_reported": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No formal human participants study.",
    336           "source": "opus"
    337         },
    338         "inclusion_exclusion_criteria": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No formal human participants study.",
    342           "source": "opus"
    343         },
    344         "randomization_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No formal human participants study.",
    348           "source": "opus"
    349         },
    350         "blinding_described": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No formal human participants study.",
    354           "source": "opus"
    355         },
    356         "attrition_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "No formal human participants study.",
    360           "source": "opus"
    361         }
    362       },
    363       "cost_and_practicality": {
    364         "inference_cost_reported": {
    365           "applies": true,
    366           "answer": true,
    367           "justification": "Table 3 reports 'Average Latency per Query: 11.2s' for BPO-TA. Table 2 reports average interactions per task level for AppWorld. However, no monetary cost is reported.",
    368           "source": "opus"
    369         },
    370         "compute_budget_stated": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "No total compute budget stated — no GPU hours, API costs, or total spend for running the benchmarks or pilot.",
    374           "source": "opus"
    375         }
    376       },
    377       "experimental_rigor": {
    378         "seed_sensitivity_reported": {
    379           "applies": true,
    380           "answer": false,
    381           "justification": "No mention of multiple random seeds. All results appear to be single-run.",
    382           "source": "opus"
    383         },
    384         "number_of_runs_stated": {
    385           "applies": true,
    386           "answer": false,
    387           "justification": "Number of experimental runs not stated for any benchmark evaluation.",
    388           "source": "opus"
    389         },
    390         "hyperparameter_search_budget": {
    391           "applies": true,
    392           "answer": false,
    393           "justification": "No hyperparameter search budget reported despite the system having many configurable components.",
    394           "source": "opus"
    395         },
    396         "best_config_selection_justified": {
    397           "applies": true,
    398           "answer": false,
    399           "justification": "No discussion of how the final system configuration was selected or how many configurations were tried.",
    400           "source": "opus"
    401         },
    402         "multiple_comparison_correction": {
    403           "applies": false,
    404           "answer": false,
    405           "justification": "No statistical tests performed, so multiple comparison correction is not applicable.",
    406           "source": "opus"
    407         },
    408         "self_comparison_bias_addressed": {
    409           "applies": true,
    410           "answer": false,
    411           "justification": "IBM authors evaluate their own CUGA system against baselines without acknowledging self-evaluation bias. Lucic et al. (2018) concern applies directly.",
    412           "source": "opus"
    413         },
    414         "compute_budget_vs_performance": {
    415           "applies": true,
    416           "answer": false,
    417           "justification": "No comparison of compute budgets between CUGA and baselines. CUGA's hierarchical multi-agent architecture likely uses substantially more compute than simpler baselines, but this is not quantified or discussed.",
    418           "source": "opus"
    419         },
    420         "benchmark_construct_validity": {
    421           "applies": true,
    422           "answer": false,
    423           "justification": "No discussion of whether WebArena and AppWorld actually measure enterprise readiness. The paper argues enterprise deployment requires different evaluation but does not question the construct validity of the academic benchmarks it uses for SOTA claims.",
    424           "source": "opus"
    425         },
    426         "scaffold_confound_addressed": {
    427           "applies": true,
    428           "answer": false,
    429           "justification": "CUGA is a complex scaffold compared to baselines (ReAct, etc.). The paper does not separate model capability from scaffold contribution. Comparisons in Tables 5 and 7 mix different scaffolds.",
    430           "source": "opus"
    431         }
    432       },
    433       "data_leakage": {
    434         "temporal_leakage_addressed": {
    435           "applies": true,
    436           "answer": false,
    437           "justification": "No discussion of temporal leakage. WebArena (2023) and AppWorld (2024) existed before model training — potential solutions could be in training data.",
    438           "source": "opus"
    439         },
    440         "feature_leakage_addressed": {
    441           "applies": true,
    442           "answer": false,
    443           "justification": "No discussion of whether the evaluation setup provides information not available in real deployment.",
    444           "source": "opus"
    445         },
    446         "non_independence_addressed": {
    447           "applies": true,
    448           "answer": false,
    449           "justification": "No discussion of independence between training data and benchmark content.",
    450           "source": "opus"
    451         },
    452         "leakage_detection_method": {
    453           "applies": true,
    454           "answer": false,
    455           "justification": "No leakage detection or prevention methods applied.",
    456           "source": "opus"
    457         }
    458       }
    459     }
    460   },
    461   "claims": [
    462     {
    463       "claim": "CUGA achieves state-of-the-art among published agents on WebArena (61.7%) and AppWorld Test-Challenge (48.2% scenario completion).",
    464       "evidence": "Tables 5 and 7 compare CUGA against contemporary published systems including OpenAI Operator (58.1%) and Chen et al. (2025) on official leaderboards.",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "CUGA achieved 87% accuracy on the BPO-TA benchmark, approaching the accuracy of specialized hand-crafted agents.",
    469       "evidence": "Table 3 reports 87% task accuracy and 78% valid-first-try rate. However, BPO-TA was created by the same team that built CUGA, and no specialized agent baseline is directly compared on this benchmark.",
    470       "supported": "moderate"
    471     },
    472     {
    473       "claim": "Reflective retries improve valid-first-try rate by 11 points and variable tracking improves reproducibility by 15 points.",
    474       "evidence": "Section 6.1 mentions these as ablation results but no ablation table, run details, or confidence intervals accompany the figures.",
    475       "supported": "moderate"
    476     },
    477     {
    478       "claim": "CUGA reduces average time-to-answer from ~20 minutes (manual) to 2–5 minutes, an ~90% improvement.",
    479       "evidence": "Table 4 and Section 6.2 report this figure but explicitly acknowledge it is from 'controlled test environments and limited analyst feedback rather than full production deployment' and 'not formally tested for statistical significance.'",
    480       "supported": "weak"
    481     },
    482     {
    483       "claim": "CUGA reduces development time by ~90% and development cost by ~50% compared to specialized agents.",
    484       "evidence": "Section 7 reports these figures as 'internal projections and controlled simulations.' No methodology for computing these estimates, no baseline measurements, and no statistical validation are provided.",
    485       "supported": "unsupported"
    486     },
    487     {
    488       "claim": "CUGA's API/Tool Hub reduces new endpoint onboarding time from weeks to hours.",
    489       "evidence": "Section 5 describes this as a design benefit qualitatively. No empirical measurement of onboarding time with or without the hub is presented.",
    490       "supported": "weak"
    491     }
    492   ],
    493   "methodology_tags": [
    494     "benchmark-eval",
    495     "case-study"
    496   ],
    497   "key_findings": "CUGA achieves leading published performance on WebArena (61.7%) and AppWorld Test-Challenge (48.2%) using a hierarchical planner-executor architecture with schema-grounded prompting and reflective retries. In a preliminary enterprise pilot in IBM's BPO talent acquisition domain using a 26-task proprietary benchmark (BPO-TA) created by the same team, CUGA scored 87% accuracy and reduced query latency to 11.2 seconds. Claimed business impact figures (90% time reduction, 50% cost reduction) are based on simulated workflows and internal projections without statistical validation. The paper's primary contribution is architectural design principles and organizational lessons for bridging benchmark performance and enterprise deployment, rather than formally validated production impact.",
    498   "red_flags": [
    499     {
    500       "flag": "Self-evaluation bias",
    501       "detail": "IBM researchers evaluate their own IBM product (CUGA) on a proprietary benchmark (BPO-TA) they created themselves, deployed in IBM's own business unit, evaluated by IBM employees. No independent third-party evaluation is conducted."
    502     },
    503     {
    504       "flag": "No statistical testing (self-disclosed)",
    505       "detail": "Section 6.2 explicitly acknowledges that results are 'not formally tested for statistical significance.' Despite this, causal and comparative claims with precise percentage figures are presented throughout."
    506     },
    507     {
    508       "flag": "Business impact from simulations only",
    509       "detail": "The 90% time reduction and 50% cost reduction figures come from 'internal projections and controlled simulations,' not production deployment. These figures appear in the abstract and contributions list without adequate hedging."
    510     },
    511     {
    512       "flag": "BPO-TA benchmark created by evaluators",
    513       "detail": "The 26-task domain benchmark used to measure enterprise performance was designed by the same team that built CUGA. This introduces significant risk of benchmark overfitting and favorable task selection."
    514     },
    515     {
    516       "flag": "Unblinded analyst evaluation from same organization",
    517       "detail": "The Analyst-Reported Reproducibility score (4.6/5) and qualitative feedback come from IBM BPO employees evaluating an IBM product under no blinding, with no description of how many participants were involved."
    518     },
    519     {
    520       "flag": "Contamination not addressed",
    521       "detail": "GPT-4.1 is reported as the model for the AppWorld evaluation (Table 7); the model used for WebArena is not specified. Both are public benchmarks released before GPT-4.1's likely training cutoff, but no contamination analysis is provided."
    522     },
    523     {
    524       "flag": "Tiny benchmark, 26 tasks",
    525       "detail": "BPO-TA contains only 26 tasks, too small to draw reliable conclusions about enterprise performance or support ablation claims. No power analysis or justification for this sample size is provided."
    526     }
    527   ],
    528   "cited_papers": [
    529     {
    530       "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents",
    531       "relevance": "Primary benchmark for CUGA evaluation; foundational benchmark for web agent research"
    532     },
    533     {
    534       "title": "AppWorld: A Controllable World of Apps and People for Benchmarking Interactive Coding Agents",
    535       "relevance": "Second primary benchmark for CUGA evaluation; multi-application orchestration benchmark"
    536     },
    537     {
    538       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    539       "relevance": "Foundational agentic paradigm used as baseline comparison and architectural reference"
    540     },
    541     {
    542       "title": "Reflexion: Language agents with verbal reinforcement learning",
    543       "relevance": "Prior work on reflective retries in agentic systems, directly relevant to CUGA's reliability mechanisms"
    544     },
    545     {
    546       "title": "ST-WebAgentBench: A Benchmark for Evaluating Safety and Trustworthiness in Web Agents",
    547       "relevance": "Directly related safety-focused benchmark from same research group (Levy et al., several shared authors)"
    548     },
    549     {
    550       "title": "Tau-Bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains",
    551       "relevance": "Related benchmark evaluating tool-agent-user dynamics in realistic settings, used for evaluation context"
    552     },
    553     {
    554       "title": "Magentic-One: A Generalist Multi-Agent System for Solving Complex Tasks",
    555       "relevance": "Contemporary generalist agent from Microsoft, directly comparable system architecture"
    556     },
    557     {
    558       "title": "Towards Enterprise-Ready Computer Using Generalist Agent",
    559       "relevance": "Companion architecture paper (Marreed et al. 2025) describing CUGA's full design in more detail"
    560     }
    561   ],
    562   "engagement_factors": {
    563     "practical_relevance": {
    564       "score": 3,
    565       "justification": "Directly addresses enterprise deployment of AI agents with specific architectural patterns, governance requirements, and organizational lessons applicable to practitioners moving agents to production."
    566     },
    567     "surprise_contrarian": {
    568       "score": 1,
    569       "justification": "The main thesis — that generalist agents need adaptation for enterprise — is expected. No surprising or counter-intuitive empirical findings are presented."
    570     },
    571     "fear_safety": {
    572       "score": 1,
    573       "justification": "Addresses enterprise safety, HITL, and governance as engineering requirements, but frames these as solved challenges rather than raising unresolved safety concerns."
    574     },
    575     "drama_conflict": {
    576       "score": 1,
    577       "justification": "The 'benchmarks vs business impact' framing creates mild tension, but the paper presents IBM's own system favorably throughout without genuine controversy."
    578     },
    579     "demo_ability": {
    580       "score": 2,
    581       "justification": "CUGA is open-sourced on GitHub and WebArena/AppWorld are public benchmarks, allowing practitioners to evaluate the system, though enterprise BPO context requires significant proprietary setup."
    582     },
    583     "brand_recognition": {
    584       "score": 3,
    585       "justification": "IBM is a major enterprise AI and consulting company with a large installed base. IBM Research has high credibility in the enterprise AI space, and the IBM Consulting deployment context adds commercial weight."
    586     }
    587   },
    588   "hn_data": {
    589     "threads": [],
    590     "top_points": 0,
    591     "total_points": 0,
    592     "total_comments": 0
    593   }
    594 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs