scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25701B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Design and Evaluation of an Assisted Programming Interface for Behavior Trees in Robotics",
      6     "authors": [
      7       "J. Styrud",
      8       "Matteo Iovino",
      9       "Rebecca Stower",
     10       "Mart Kartašev",
     11       "Mikael Norrlöf",
     12       "Mårten Björkman",
     13       "Christian Smith"
     14     ],
     15     "year": 2026,
     16     "venue": "arXiv",
     17     "arxiv_id": "2602.09772",
     18     "doi": null
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The abstract claims that BETR-GUI enables better task performance (LMM FULL vs MANUAL: b=61.05, p<.001, Table V) and that humans working with the AI assistant outperform the AI running alone (FULL vs NO_HUMAN, Table VI, p<.001); both claims are directly supported by the reported results.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The pre-registered ablation design with 6 variants, counterbalanced order, and LMM analysis supports causal attribution of performance differences to specific components within the scope of these tasks.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The Note to Practitioners makes broad claims about improving performance across 'the robotics industry' and 'uncontrolled environments,' while evidence comes only from 3 simplified toy tasks in a 15-minute lab study.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "The Discussion proposes multiple explanations for why NO_BO and NO_GP did not significantly outperform FULL, including users failing to utilize node-locking, the planner dominating easy tasks, and learning algorithms needing more time.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The composite score function is fully defined (Equations 1–4), its normalization is explained, and separate SUS and ranking metrics are used alongside the task score.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Limitations are discussed only in the Future Work section (Section VIII) with no dedicated limitations or threats-to-validity section.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Specific threats are named: 'benchmark tasks are highly simplified compared to actual robot applications,' the 15-minute window disadvantages learning algorithms, and users had to simultaneously learn BTs and the GUI.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper explicitly states 'the benchmark tasks, out of necessity, are highly simplified compared to actual robot applications' and calls for future studies with realistic complex tasks.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Funding from the Wallenberg AI, Autonomous Systems, and Software Program (WASP) funded by the Knut and Alice Wallenberg Foundation is disclosed in the acknowledgment.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations are disclosed including ABB Robotics, KTH, ETH Zürich, and Ericsson; ABB Robotics has direct commercial interest in robot programming tools.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "WASP/Wallenberg Foundation is an independent academic research program with no commercial stake in BETR-GUI; ABB Robotics authors have an interest but are not the funder.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests statement is present despite two authors being affiliated with ABB Robotics, which could commercially benefit from the tool being evaluated.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Behavior Trees, the composite score function, all GUI variants, and AI component algorithms (GP, BO, planning, LLM roles) are defined with sufficient precision for the paper's purposes.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Two explicit contributions are listed in the introduction: (1) the BETR-GUI tool combining multiple AI methods with a GUI, and (2) a 60-participant ablative user study.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section II provides extensive engagement with prior BT, planning, GP, BO, LLM, and composite systems work, explicitly building on specific prior methods that are integrated into BETR-GUI.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "Source code is publicly available on GitHub at https://github.com/jstyrud/BETR-GUI as explicitly stated twice in the paper.",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "The paper states 'Full analyses are available in the OSF repository' but does not explicitly state that raw participant data (scores, SUS responses) is available for independent verification.",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Only Python/PyQt5 and Unity Engine are mentioned without version specifications. No requirements.txt, Dockerfile, or dependency manifest is provided or referenced.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No step-by-step reproduction instructions are provided for recreating the experimental setup or running the user study; the paper describes system architecture but not how to reproduce experiments.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "95% CIs are reported for all LMM fixed effects (Tables IV, VI, VIII), and SD is reported in descriptive statistics for all GUI variants.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Linear Mixed Models with Tukey-adjusted post-hoc pairwise comparisons and AICc-based model selection are used throughout with p-values reported.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Pseudo-R² is reported for each model (task R²=0.62, SUS R²=0.21, NO_HUMAN R²=0.04), and mean score differences with baselines provide practical effect sizes.",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "An a priori power analysis with α=.05 determined that 60 participants provide 80% power assuming small-to-medium effects; supplementary code to recreate the analysis is noted.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": true,
    176           "justification": "Standard deviations are reported for all mean task scores (Table III) and SUS scores (Table VII) across all six GUI variants.",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "MANUAL_ONLY (no AI assistance) serves as the primary baseline, representing existing commercial BT GUIs such as Groot.",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "MANUAL_ONLY is described as 'largely similar to existing GUIs like Groot' — the current commercial standard — making it a contemporary and competitive baseline.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "The entire experimental design is a systematic ablation study with four ablation variants (NO_BO, NO_GP, NO_LLM, NO_PLANNER) each removing one component from the FULL system.",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Three metrics are used: composite task score (performance), System Usability Scale (subjective usability), and participant preference rankings.",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "The study is a human evaluation with 60 participants solving robot programming tasks and completing usability questionnaires; this is the primary evaluation method.",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": false,
    213           "answer": false,
    214           "justification": "This is a user study of an interactive tool, not a predictive machine learning task requiring a train/test split.",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Results are broken down by GUI variant, task (Cubes/Tableware/Trashpicking), and trial order with statistical tests for each factor; Figure 8 shows cross-tabulated results.",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "The Discussion addresses NO_LLM/NO_PLANNER failures, users distrusting and abandoning AI suggestions, and specific user quotes describing frustration; node-locking was used in only 74/120 experiments.",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "Key null results are foregrounded: NO_BO and NO_GP not significantly different from FULL (p=.999, p=.807), and NO_LLM and NO_PLANNER not significantly better than MANUAL_ONLY (p=.783, p=.358).",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "Only 'GPT-4' is stated without a snapshot date or API version (e.g., gpt-4-0613); multiple GPT-4 versions with different capabilities existed during the study period.",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "Prompts are referenced as 'the same method as in [12] with a slightly updated prompt' but are not provided in the paper or explicitly pointed to in the repository.",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "Score weights are on GitHub but not in the paper; GP population size, mutation rates, BO acquisition function, and surrogate model parameters are never specified.",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Section III.D and Figure 2 clearly describe the full AI assistant workflow: seed BT → planner → LLM error resolution loop → parallel GP/BO optimization with user interaction.",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Score normalization procedure is defined (0 = minimal failing two-node BT, 100 = best participant score), and GUI logging of actions/scores with timestamps is described.",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "The paper explicitly states only that 'Full analyses are available in the OSF repository'; raw participant score and SUS data are not confirmed as available.",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Section V.F describes the full procedure: GUI automatic logging of actions and scores with timestamps, SUS after each variant, demographic questionnaire, and structured one-hour session.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Recruitment via 'flyers, mailing lists, social media, and word of mouth' is explicitly stated along with 100 SEK gift card compensation.",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "Score calculation equations (1–4) are provided, R/lme4 analysis is described with model selection criteria (AICc), randomization code is on GitHub, and full analyses are on OSF.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": false,
    297           "answer": false,
    298           "justification": "This study evaluates a human-computer interface using GPT-4 as one component, not benchmarking LLM capabilities on standard datasets where training contamination is a methodological concern.",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": false,
    303           "answer": false,
    304           "justification": "Evaluation tasks are custom Unity simulations with novel parameterized scenarios; train/test overlap with LLM pre-training data is not a relevant concern for this study type.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": false,
    309           "answer": false,
    310           "justification": "No standard benchmarks are used; all evaluation scenarios were custom-designed for this study and unavailable before the study was conducted.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": true,
    317           "answer": true,
    318           "justification": "Hypotheses and planned confirmatory and exploratory analyses were pre-registered on OSF at https://osf.io/ax5gb/overview before data collection.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": true,
    323           "answer": false,
    324           "justification": "The paper states the study 'followed the ethical guidelines for Sweden' but does not mention specific IRB or ethics board approval or committee name.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": true,
    329           "answer": true,
    330           "justification": "Age (M=29.7, SD=8.9, range 20–62), gender (10 female, 50 male), and domain familiarity scores across four domains are reported in Table II.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": true,
    335           "answer": false,
    336           "justification": "Participants are described post-hoc as 'primarily university students of engineering or computer science or professional software developers,' not as formal pre-specified inclusion/exclusion criteria.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": true,
    341           "answer": true,
    342           "justification": "Task and variant order were counterbalanced in advance ensuring equal ablation exposure and order effect control; the randomization code is on GitHub.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": true,
    347           "answer": false,
    348           "justification": "No blinding is described; participants could see which GUI variant they were using, and the supervisor monitored all sessions without any blinding protocol.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": true,
    353           "answer": true,
    354           "justification": "One participant was excluded due to a GUI bug and replaced with a new participant; this attrition is explicitly reported with reason.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "GPT-4 API calls are made during each experiment session but no cost per session, latency, or total API cost is reported.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "Hardware is specified (Intel Core Ultra 9 185H CPU) but total compute time, GPU usage, or budget for the 300 NO_HUMAN ablation runs are not reported.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "BETR-GUI with full AI assistant achieves significantly higher task scores than MANUAL_ONLY (mean 91.14 vs 30.24)",
    377       "evidence": "LMM post-hoc FULL vs MANUAL: b=61.05, t=14.79, p<.001 (Table V); effect consistent across all tasks and trial orders with R²=0.62",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Human+AI collaboration (FULL) outperforms AI running alone (NO_HUMAN) given the same time budget",
    382       "evidence": "LMM FULL vs NO_HUMAN: b=3.08, t=3.46, p<.001 (Table VI); mean 91.14 vs 88.06 across 60 FULL and 300 NO_HUMAN runs",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "LLM and planner are critical components; removing either yields performance not significantly above MANUAL_ONLY",
    387       "evidence": "Post-hoc comparisons NO_LLM vs MANUAL: b=-8.83, p=.783; NO_PLANNER vs MANUAL: b=-13.32, p=.358 (Table V) — both non-significant",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Removing Bayesian Optimization or Genetic Programming does not significantly reduce task performance versus FULL",
    392       "evidence": "Post-hoc: FULL vs NO_BO p=.999; FULL vs NO_GP p=.807 (Table V); mean score differences of 0.76 and 9.13 points are within noise",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "User performance improves significantly across successive trials due to learning",
    397       "evidence": "LMM fixed effect of Trial Order: b=6.88, t=3.45, p<.001 (Table IV); ~7 normalized score points gained per trial",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "User trust in the AI assistant mediates task performance, with some participants refusing correct AI suggestions",
    402       "evidence": "Only 74/120 experiments used node-locking; user quotes describe distrust after bad AI experiences; some users rejected AI solutions that solved the task and then failed manually",
    403       "supported": "moderate"
    404     }
    405   ],
    406   "methodology_tags": [
    407     "rct",
    408     "qualitative"
    409   ],
    410   "key_findings": "BETR-GUI combining LLMs, planning, genetic programming, and Bayesian optimization significantly improves novice programmer performance on robot BT tasks versus manual-only interfaces (mean score 91.14 vs 30.24, p<.001). LLM and planner components are essential — removing either produces performance statistically indistinguishable from manual-only — while removing BO or GP has negligible impact in 15-minute sessions, likely because the planner dominates short tasks. Human-AI collaboration outperforms AI alone (p<.001), demonstrating continued human value even with an extensive AI assistant. User trust appeared to mediate performance: some participants who experienced a poor AI suggestion early came to distrust the assistant and rejected its later suggestions, even when the AI subsequently solved the task correctly.",
    411   "red_flags": [
    412     {
    413       "flag": "GPT-4 version unspecified",
    414       "detail": "Only 'GPT-4' is named without snapshot date or API version ID, making exact reproduction impossible given multiple GPT-4 variants deployed over this period."
    415     },
    416     {
    417       "flag": "Highly simplified tasks",
    418       "detail": "All evaluation scenarios are 15-minute toy tasks (~15-node BTs) in a custom Unity simulation; generalization to real industrial robotics is explicitly unvalidated and stated as future work."
    419     },
    420     {
    421       "flag": "Unequal ablation sample sizes",
    422       "detail": "FULL and MANUAL_ONLY received 60 exposures each while each ablation variant received only ~15, reducing statistical power for detecting differences between ablation variants."
    423     },
    424     {
    425       "flag": "No competing interests declaration",
    426       "detail": "Two authors are from ABB Robotics, which has direct commercial interest in robot programming tools; no competing interests statement is present."
    427     },
    428     {
    429       "flag": "Hyperparameters not in paper",
    430       "detail": "Score function weights are available only via the GitHub repository rather than in the paper, and GP population size, mutation rates, the BO surrogate model, and the acquisition function are not specified in the paper."
    431     },
    432     {
    433       "flag": "Male-dominated sample",
    434       "detail": "83% of participants identified as male (50/60), limiting generalizability of usability findings to broader populations including industrial operators."
    435     }
    436   ],
    437   "cited_papers": [
    438     {
    439       "title": "A survey of Behavior Trees in robotics and AI",
    440       "relevance": "Comprehensive survey establishing the state of BT methods in robotics; provides foundational background and taxonomy used throughout the paper"
    441     },
    442     {
    443       "title": "Automatic behavior tree expansion with LLMs for robotic manipulation",
    444       "relevance": "Direct predecessor work (BETR-XP-LLM) that BETR-GUI builds upon for LLM-based BT expansion and error resolution"
    445     },
    446     {
    447       "title": "Combining Planning and Learning of Behavior Trees for Robotic Assembly",
    448       "relevance": "Prior system from same team combining planning and GP for BT creation; BETR-GUI integrates and extends this"
    449     },
    450     {
    451       "title": "BeBOP: Combining Reactive Planning and Bayesian Optimization to Solve Robotic Manipulation Tasks",
    452       "relevance": "Prior work on BO for robot BT optimization that is integrated as a component in BETR-GUI"
    453     },
    454     {
    455       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    456       "relevance": "Cited as evidence that AI assistants can decrease developer performance — key contrast motivating BETR-GUI's positive result"
    457     },
    458     {
    459       "title": "Integrating intent understanding and optimal behavior planning for behavior tree generation from human instructions",
    460       "relevance": "Contemporary LLM+BT system (LLM-OBTEA) used as prior art and design reference for the AI assistant"
    461     },
    462     {
    463       "title": "Behavior Trees in Robotics and AI: An Introduction",
    464       "relevance": "Reference textbook defining BT semantics and operations; foundational context for the paper's implementation"
    465     }
    466   ],
    467   "engagement_factors": {
    468     "practical_relevance": {
    469       "score": 2,
    470       "justification": "Directly applicable to robotics practitioners programming behavior trees; code is released on GitHub with an instruction video."
    471     },
    472     "surprise_contrarian": {
    473       "score": 1,
    474       "justification": "Mildly surprising that BO and GP add no significant value over LLM+planner alone, and that trust mediates performance more than algorithmic capability."
    475     },
    476     "fear_safety": {
    477       "score": 0,
    478       "justification": "No AI safety or risk concerns raised; the paper focuses on productivity in robot programming."
    479     },
    480     "drama_conflict": {
    481       "score": 1,
    482       "justification": "Engages directly with the recent finding that AI assistants can make developers worse, then shows a positive result for the specific domain of robot BT programming."
    483     },
    484     "demo_ability": {
    485       "score": 2,
    486       "justification": "Code is on GitHub with a publicly available 5-minute instruction video; practitioners could realistically try the tool."
    487     },
    488     "brand_recognition": {
    489       "score": 1,
    490       "justification": "ABB Robotics and KTH are well known in robotics but are not broadly recognized as AI research labs."
    491     }
    492   },
    493   "hn_data": {
    494     "threads": [],
    495     "top_points": 0,
    496     "total_points": 0,
    497     "total_comments": 0
    498   }
    499 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs