scan-v5.json (25701B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Design and Evaluation of an Assisted Programming Interface for Behavior Trees in Robotics", 6 "authors": [ 7 "J. Styrud", 8 "Matteo Iovino", 9 "Rebecca Stower", 10 "Mart Kartašev", 11 "Mikael Norrlöf", 12 "Mårten Björkman", 13 "Christian Smith" 14 ], 15 "year": 2026, 16 "venue": "arXiv", 17 "arxiv_id": "2602.09772", 18 "doi": null 19 }, 20 "checklist": { 21 "claims_and_evidence": { 22 "abstract_claims_supported": { 23 "applies": true, 24 "answer": true, 25 "justification": "The abstract claims BETR-GUI enables better task performance (LMM FULL vs MANUAL: b=61.05, p<.001, Table V) and humans outperform AI alone (Table VI, p<.001); both are directly supported by the reported results.", 26 "source": "haiku" 27 }, 28 "causal_claims_justified": { 29 "applies": true, 30 "answer": true, 31 "justification": "The pre-registered ablation design with 6 variants, counterbalanced order, and LMM analysis supports causal attribution of performance differences to specific components within the scope of these tasks.", 32 "source": "haiku" 33 }, 34 "generalization_bounded": { 35 "applies": true, 36 "answer": false, 37 "justification": "The Note to Practitioners makes broad claims about improving performance across 'the robotics industry' and 'uncontrolled environments,' while evidence comes only from 3 simplified toy tasks, each attempted within a 15-minute window, in a lab study.", 38 "source": "haiku" 39 }, 40 "alternative_explanations_discussed": { 41 "applies": true, 42 "answer": true, 43 "justification": "The Discussion proposes multiple explanations for why FULL did not significantly outperform NO_BO and NO_GP, including users failing to utilize node-locking, the planner dominating easy tasks, and learning algorithms needing more time.", 44 "source": "haiku" 45 }, 46 "proxy_outcome_distinction": { 47 "applies": true, 48 "answer": true, 49 "justification": "The composite score function is fully defined (Equations 1–4), its 
normalization is explained, and separate SUS and ranking metrics are used alongside the task score.", 50 "source": "haiku" 51 } 52 }, 53 "limitations_and_scope": { 54 "limitations_section_present": { 55 "applies": true, 56 "answer": false, 57 "justification": "Limitations are discussed only in the Future Work section (Section VIII) with no dedicated limitations or threats-to-validity section.", 58 "source": "haiku" 59 }, 60 "threats_to_validity_specific": { 61 "applies": true, 62 "answer": true, 63 "justification": "Specific threats are named: 'benchmark tasks are highly simplified compared to actual robot applications,' the 15-minute window disadvantages learning algorithms, and users had to simultaneously learn BTs and the GUI.", 64 "source": "haiku" 65 }, 66 "scope_boundaries_stated": { 67 "applies": true, 68 "answer": true, 69 "justification": "The paper explicitly states 'the benchmark tasks, out of necessity, are highly simplified compared to actual robot applications' and calls for future studies with realistic complex tasks.", 70 "source": "haiku" 71 } 72 }, 73 "conflicts_of_interest": { 74 "funding_disclosed": { 75 "applies": true, 76 "answer": true, 77 "justification": "Funding from the Wallenberg AI, Autonomous Systems, and Software Program (WASP) funded by the Knut and Alice Wallenberg Foundation is disclosed in the acknowledgment.", 78 "source": "haiku" 79 }, 80 "affiliations_disclosed": { 81 "applies": true, 82 "answer": true, 83 "justification": "Author affiliations are disclosed including ABB Robotics, KTH, ETH Zürich, and Ericsson; ABB Robotics has direct commercial interest in robot programming tools.", 84 "source": "haiku" 85 }, 86 "funder_independent_of_outcome": { 87 "applies": true, 88 "answer": true, 89 "justification": "WASP/Wallenberg Foundation is an independent academic research program with no commercial stake in BETR-GUI; ABB Robotics authors have an interest but are not the funder.", 90 "source": "haiku" 91 }, 92 
"financial_interests_declared": { 93 "applies": true, 94 "answer": false, 95 "justification": "No competing interests statement is present despite two authors being affiliated with ABB Robotics, which could commercially benefit from the tool being evaluated.", 96 "source": "haiku" 97 } 98 }, 99 "scope_and_framing": { 100 "key_terms_defined": { 101 "applies": true, 102 "answer": true, 103 "justification": "Behavior Trees, the composite score function, all GUI variants, and AI component algorithms (GP, BO, planning, LLM roles) are defined with sufficient precision for the paper's purposes.", 104 "source": "haiku" 105 }, 106 "intended_contribution_clear": { 107 "applies": true, 108 "answer": true, 109 "justification": "Two explicit contributions are listed in the introduction: (1) the BETR-GUI tool combining multiple AI methods with a GUI, and (2) a 60-participant ablative user study.", 110 "source": "haiku" 111 }, 112 "engagement_with_prior_work": { 113 "applies": true, 114 "answer": true, 115 "justification": "Section II provides extensive engagement with prior BT, planning, GP, BO, LLM, and composite systems work, explicitly building on specific prior methods that are integrated into BETR-GUI.", 116 "source": "haiku" 117 } 118 } 119 }, 120 "type_checklist": { 121 "empirical": { 122 "artifacts": { 123 "code_released": { 124 "applies": true, 125 "answer": true, 126 "justification": "Source code is publicly available on GitHub at https://github.com/jstyrud/BETR-GUI as explicitly stated twice in the paper.", 127 "source": "haiku" 128 }, 129 "data_released": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper states 'Full analyses are available in the OSF repository' but does not explicitly state that raw participant data (scores, SUS responses) is available for independent verification.", 133 "source": "haiku" 134 }, 135 "environment_specified": { 136 "applies": true, 137 "answer": false, 138 "justification": "Only Python/PyQt5 and Unity Engine 
are mentioned without version specifications. No requirements.txt, Dockerfile, or dependency manifest is provided or referenced.", 139 "source": "haiku" 140 }, 141 "reproduction_instructions": { 142 "applies": true, 143 "answer": false, 144 "justification": "No step-by-step reproduction instructions are provided for recreating the experimental setup or running the user study; the paper describes system architecture but not how to reproduce experiments.", 145 "source": "haiku" 146 } 147 }, 148 "statistical_methodology": { 149 "confidence_intervals_or_error_bars": { 150 "applies": true, 151 "answer": true, 152 "justification": "95% CIs are reported for all LMM fixed effects (Tables IV, VI, VIII), and SD is reported in descriptive statistics for all GUI variants.", 153 "source": "haiku" 154 }, 155 "significance_tests": { 156 "applies": true, 157 "answer": true, 158 "justification": "Linear Mixed Models with Tukey-adjusted post-hoc pairwise comparisons and AICc-based model selection are used throughout with p-values reported.", 159 "source": "haiku" 160 }, 161 "effect_sizes_reported": { 162 "applies": true, 163 "answer": true, 164 "justification": "Pseudo-R² is reported for each model (task R²=0.62, SUS R²=0.21, NO_HUMAN R²=0.04), and mean score differences with baselines provide practical effect sizes.", 165 "source": "haiku" 166 }, 167 "sample_size_justified": { 168 "applies": true, 169 "answer": true, 170 "justification": "An a priori power analysis with α=.05 determined that 60 participants provides 80% power assuming small-medium effects; supplementary code to recreate the analysis is noted.", 171 "source": "haiku" 172 }, 173 "variance_reported": { 174 "applies": true, 175 "answer": true, 176 "justification": "Standard deviations are reported for all mean task scores (Table III) and SUS scores (Table VII) across all six GUI variants.", 177 "source": "haiku" 178 } 179 }, 180 "evaluation_design": { 181 "baselines_included": { 182 "applies": true, 183 "answer": true, 
184 "justification": "MANUAL_ONLY (no AI assistance) serves as the primary baseline, representing existing commercial BT GUIs such as Groot.", 185 "source": "haiku" 186 }, 187 "baselines_contemporary": { 188 "applies": true, 189 "answer": true, 190 "justification": "MANUAL_ONLY is described as 'largely similar to existing GUIs like Groot' — the current commercial standard — making it a contemporary and competitive baseline.", 191 "source": "haiku" 192 }, 193 "ablation_study": { 194 "applies": true, 195 "answer": true, 196 "justification": "The entire experimental design is a systematic ablation study with four ablation variants (NO_BO, NO_GP, NO_LLM, NO_PLANNER) each removing one component from the FULL system.", 197 "source": "haiku" 198 }, 199 "multiple_metrics": { 200 "applies": true, 201 "answer": true, 202 "justification": "Three metrics are used: composite task score (performance), System Usability Scale (subjective usability), and participant preference rankings.", 203 "source": "haiku" 204 }, 205 "human_evaluation": { 206 "applies": true, 207 "answer": true, 208 "justification": "The study is a human evaluation with 60 participants solving robot programming tasks and completing usability questionnaires; this is the primary evaluation method.", 209 "source": "haiku" 210 }, 211 "held_out_test_set": { 212 "applies": false, 213 "answer": false, 214 "justification": "This is a user study of an interactive tool, not a predictive machine learning task requiring a train/test split.", 215 "source": "haiku" 216 }, 217 "per_category_breakdown": { 218 "applies": true, 219 "answer": true, 220 "justification": "Results are broken down by GUI variant, task (Cubes/Tableware/Trashpicking), and trial order with statistical tests for each factor; Figure 8 shows cross-tabulated results.", 221 "source": "haiku" 222 }, 223 "failure_cases_discussed": { 224 "applies": true, 225 "answer": true, 226 "justification": "The Discussion addresses NO_LLM/NO_PLANNER failures, users 
distrusting and abandoning AI suggestions, and specific user quotes describing frustration; node-locking was used in only 74/120 experiments.", 227 "source": "haiku" 228 }, 229 "negative_results_reported": { 230 "applies": true, 231 "answer": true, 232 "justification": "Key null results are foregrounded: NO_BO and NO_GP not significantly different from FULL (p=.999, p=.807), and NO_LLM and NO_PLANNER not significantly better than MANUAL_ONLY (p=.783, p=.358).", 233 "source": "haiku" 234 } 235 }, 236 "setup_transparency": { 237 "model_versions_specified": { 238 "applies": true, 239 "answer": false, 240 "justification": "Only 'GPT-4' is stated without a snapshot date or API version (e.g., gpt-4-0613); multiple GPT-4 versions with different capabilities existed during the study period.", 241 "source": "haiku" 242 }, 243 "prompts_provided": { 244 "applies": true, 245 "answer": false, 246 "justification": "Prompts are referenced as 'the same method as in [12] with a slightly updated prompt' but are not provided in the paper or explicitly pointed to in the repository.", 247 "source": "haiku" 248 }, 249 "hyperparameters_reported": { 250 "applies": true, 251 "answer": false, 252 "justification": "Score weights are on GitHub but not in the paper; GP population size, mutation rates, BO acquisition function, and surrogate model parameters are never specified.", 253 "source": "haiku" 254 }, 255 "scaffolding_described": { 256 "applies": true, 257 "answer": true, 258 "justification": "Section III.D and Figure 2 clearly describe the full AI assistant workflow: seed BT → planner → LLM error resolution loop → parallel GP/BO optimization with user interaction.", 259 "source": "haiku" 260 }, 261 "data_preprocessing_documented": { 262 "applies": true, 263 "answer": true, 264 "justification": "Score normalization procedure is defined (0 = minimal failing two-node BT, 100 = best participant score), and GUI logging of actions/scores with timestamps is described.", 265 "source": "haiku" 
266 } 267 }, 268 "data_integrity": { 269 "raw_data_available": { 270 "applies": true, 271 "answer": false, 272 "justification": "The paper explicitly states only that 'Full analyses are available in the OSF repository'; raw participant score and SUS data are not confirmed as available.", 273 "source": "haiku" 274 }, 275 "data_collection_described": { 276 "applies": true, 277 "answer": true, 278 "justification": "Section V.F describes the full procedure: GUI automatic logging of actions and scores with timestamps, SUS after each variant, demographic questionnaire, and structured one-hour session.", 279 "source": "haiku" 280 }, 281 "recruitment_methods_described": { 282 "applies": true, 283 "answer": true, 284 "justification": "Recruitment via 'flyers, mailing lists, social media, and word of mouth' is explicitly stated along with 100 SEK gift card compensation.", 285 "source": "haiku" 286 }, 287 "data_pipeline_documented": { 288 "applies": true, 289 "answer": true, 290 "justification": "Score calculation equations (1–4) are provided, R/lme4 analysis is described with model selection criteria (AICc), randomization code is on GitHub, and full analyses are on OSF.", 291 "source": "haiku" 292 } 293 }, 294 "contamination": { 295 "training_cutoff_stated": { 296 "applies": false, 297 "answer": false, 298 "justification": "This study evaluates a human-computer interface using GPT-4 as one component, not benchmarking LLM capabilities on standard datasets where training contamination is a methodological concern.", 299 "source": "haiku" 300 }, 301 "train_test_overlap_discussed": { 302 "applies": false, 303 "answer": false, 304 "justification": "Evaluation tasks are custom Unity simulations with novel parameterized scenarios; train/test overlap with LLM pre-training data is not a relevant concern for this study type.", 305 "source": "haiku" 306 }, 307 "benchmark_contamination_addressed": { 308 "applies": false, 309 "answer": false, 310 "justification": "No standard benchmarks 
are used; all evaluation scenarios were custom-designed for this study and unavailable before the study was conducted.", 311 "source": "haiku" 312 } 313 }, 314 "human_studies": { 315 "pre_registered": { 316 "applies": true, 317 "answer": true, 318 "justification": "Hypotheses and planned confirmatory and exploratory analyses were pre-registered on OSF at https://osf.io/ax5gb/overview before data collection.", 319 "source": "haiku" 320 }, 321 "irb_or_ethics_approval": { 322 "applies": true, 323 "answer": false, 324 "justification": "The paper states the study 'followed the ethical guidelines for Sweden' but does not mention specific IRB or ethics board approval or committee name.", 325 "source": "haiku" 326 }, 327 "demographics_reported": { 328 "applies": true, 329 "answer": true, 330 "justification": "Age (M=29.7, SD=8.9, range 20–62), gender (10 female, 50 male), and domain familiarity scores across four domains are reported in Table II.", 331 "source": "haiku" 332 }, 333 "inclusion_exclusion_criteria": { 334 "applies": true, 335 "answer": false, 336 "justification": "Participants are described post-hoc as 'primarily university students of engineering or computer science or professional software developers,' not as formal pre-specified inclusion/exclusion criteria.", 337 "source": "haiku" 338 }, 339 "randomization_described": { 340 "applies": true, 341 "answer": true, 342 "justification": "Task and variant order were counterbalanced in advance ensuring equal ablation exposure and order effect control; the randomization code is on GitHub.", 343 "source": "haiku" 344 }, 345 "blinding_described": { 346 "applies": true, 347 "answer": false, 348 "justification": "No blinding is described; participants could see which GUI variant they were using, and the supervisor monitored all sessions without any blinding protocol.", 349 "source": "haiku" 350 }, 351 "attrition_reported": { 352 "applies": true, 353 "answer": true, 354 "justification": "One participant was excluded due 
to a GUI bug and replaced with a new participant; this attrition is explicitly reported with reason.", 355 "source": "haiku" 356 } 357 }, 358 "cost_and_practicality": { 359 "inference_cost_reported": { 360 "applies": true, 361 "answer": false, 362 "justification": "GPT-4 API calls are made during each experiment session but no cost per session, latency, or total API cost is reported.", 363 "source": "haiku" 364 }, 365 "compute_budget_stated": { 366 "applies": true, 367 "answer": false, 368 "justification": "Hardware is specified (Intel Core Ultra 9 185H CPU) but total compute time, GPU usage, or budget for the 300 NO_HUMAN ablation runs are not reported.", 369 "source": "haiku" 370 } 371 } 372 } 373 }, 374 "claims": [ 375 { 376 "claim": "BETR-GUI with full AI assistant achieves significantly higher task scores than MANUAL_ONLY (mean 91.14 vs 30.24)", 377 "evidence": "LMM post-hoc FULL vs MANUAL: b=61.05, t=14.79, p<.001 (Table V); effect consistent across all tasks and trial orders with R²=0.62", 378 "supported": "strong" 379 }, 380 { 381 "claim": "Human+AI collaboration (FULL) outperforms AI running alone (NO_HUMAN) given the same time budget", 382 "evidence": "LMM FULL vs NO_HUMAN: b=3.08, t=3.46, p<.001 (Table VI); mean 91.14 vs 88.06 across 60 FULL and 300 NO_HUMAN runs", 383 "supported": "strong" 384 }, 385 { 386 "claim": "LLM and planner are critical components; removing either yields performance not significantly above MANUAL_ONLY", 387 "evidence": "Post-hoc comparisons NO_LLM vs MANUAL: b=-8.83, p=.783; NO_PLANNER vs MANUAL: b=-13.32, p=.358 (Table V) — both non-significant", 388 "supported": "strong" 389 }, 390 { 391 "claim": "Removing Bayesian Optimization or Genetic Programming does not significantly reduce task performance versus FULL", 392 "evidence": "Post-hoc: FULL vs NO_BO p=.999; FULL vs NO_GP p=.807 (Table V); mean score differences of 0.76 and 9.13 points are within noise", 393 "supported": "strong" 394 }, 395 { 396 "claim": "User performance 
improves significantly across successive trials due to learning", 397 "evidence": "LMM fixed effect of Trial Order: b=6.88, t=3.45, p<.001 (Table IV); ~7 normalized score points gained per trial", 398 "supported": "strong" 399 }, 400 { 401 "claim": "User trust in the AI assistant mediates task performance, with some participants refusing correct AI suggestions", 402 "evidence": "Only 74/120 experiments used node-locking; user quotes describe distrust after bad AI experiences; some users rejected AI solutions that solved the task and then failed manually", 403 "supported": "moderate" 404 } 405 ], 406 "methodology_tags": [ 407 "rct", 408 "qualitative" 409 ], 410 "key_findings": "BETR-GUI combining LLMs, planning, genetic programming, and Bayesian optimization significantly improves novice programmer performance on robot BT tasks versus manual-only interfaces (mean score 91.14 vs 30.24, p<.001). LLM and planner components are essential — removing either produces performance statistically indistinguishable from manual-only — while removing BO or GP has negligible impact in 15-minute sessions, likely because the planner dominates short tasks. Human-AI collaboration outperforms AI alone (p<.001), demonstrating continued human value even with an extensive AI assistant. User trust emerged as a key behavioral mediator: participants who experienced a poor AI suggestion early often abandoned the tool entirely, even when the AI subsequently solved the task correctly.", 411 "red_flags": [ 412 { 413 "flag": "GPT-4 version unspecified", 414 "detail": "Only 'GPT-4' is named without snapshot date or API version ID, making exact reproduction impossible given multiple GPT-4 variants deployed over this period." 415 }, 416 { 417 "flag": "Highly simplified tasks", 418 "detail": "All evaluation scenarios are 15-minute toy tasks (~15-node BTs) in a custom Unity simulation; generalization to real industrial robotics is explicitly unvalidated and stated as future work." 
419 }, 420 { 421 "flag": "Unequal ablation sample sizes", 422 "detail": "FULL and MANUAL_ONLY received 60 exposures each while each ablation variant received only ~15, reducing statistical power for detecting differences between ablation variants." 423 }, 424 { 425 "flag": "No competing interests declaration", 426 "detail": "Two authors are from ABB Robotics, which has direct commercial interest in robot programming tools; no competing interests statement is present." 427 }, 428 { 429 "flag": "Hyperparameters not in paper", 430 "detail": "GP population size, mutation rates, BO surrogate model, and acquisition function are not reported in the paper; score function weights are also absent from the paper and are available only via the GitHub repository." 431 }, 432 { 433 "flag": "Male-dominated sample", 434 "detail": "83% of participants identified as male (50/60), limiting generalizability of usability findings to broader populations including industrial operators." 435 } 436 ], 437 "cited_papers": [ 438 { 439 "title": "A survey of Behavior Trees in robotics and AI", 440 "relevance": "Comprehensive survey establishing the state of BT methods in robotics; provides foundational background and taxonomy used throughout the paper" 441 }, 442 { 443 "title": "Automatic behavior tree expansion with LLMs for robotic manipulation", 444 "relevance": "Direct predecessor work (BETR-XP-LLM) that BETR-GUI builds upon for LLM-based BT expansion and error resolution" 445 }, 446 { 447 "title": "Combining Planning and Learning of Behavior Trees for Robotic Assembly", 448 "relevance": "Prior system from same team combining planning and GP for BT creation; BETR-GUI integrates and extends this" 449 }, 450 { 451 "title": "BeBOP: Combining Reactive Planning and Bayesian Optimization to Solve Robotic Manipulation Tasks", 452 "relevance": "Prior work on BO for robot BT optimization that is integrated as a component in BETR-GUI" 453 }, 454 { 455 "title": "Measuring the impact of early-2025 AI on experienced open-source 
developer productivity", 456 "relevance": "Cited as evidence that AI assistants can decrease developer performance — key contrast motivating BETR-GUI's positive result" 457 }, 458 { 459 "title": "Integrating intent understanding and optimal behavior planning for behavior tree generation from human instructions", 460 "relevance": "Contemporary LLM+BT system (LLM-OBTEA) used as prior art and design reference for the AI assistant" 461 }, 462 { 463 "title": "Behavior Trees in Robotics and AI: An Introduction", 464 "relevance": "Reference textbook defining BT semantics and operations; foundational context for the paper's implementation" 465 } 466 ], 467 "engagement_factors": { 468 "practical_relevance": { 469 "score": 2, 470 "justification": "Directly applicable to robotics practitioners programming behavior trees; code is released on GitHub with an instruction video." 471 }, 472 "surprise_contrarian": { 473 "score": 1, 474 "justification": "Mildly surprising that BO and GP add no significant value over LLM+planner alone, and that trust mediates performance more than algorithmic capability." 475 }, 476 "fear_safety": { 477 "score": 0, 478 "justification": "No AI safety or risk concerns raised; the paper focuses on productivity in robot programming." 479 }, 480 "drama_conflict": { 481 "score": 1, 482 "justification": "Engages directly with the recent finding that AI assistants can make developers worse, then shows a positive result for the specific domain of robot BT programming." 483 }, 484 "demo_ability": { 485 "score": 2, 486 "justification": "Code is on GitHub with a publicly available 5-minute instruction video; practitioners could realistically try the tool." 487 }, 488 "brand_recognition": { 489 "score": 1, 490 "justification": "ABB Robotics and KTH are well-known in robotics but not broadly recognized AI research labs." 491 } 492 }, 493 "hn_data": { 494 "threads": [], 495 "top_points": 0, 496 "total_points": 0, 497 "total_comments": 0 498 } 499 }