scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23208B)
      1 {
      2   "paper": {
      3     "title": "Design and Evaluation of an Assisted Programming Interface for Behavior Trees in Robotics",
      4     "authors": [
      5       "Jonathan Styrud",
      6       "Matteo Iovino",
      7       "Rebecca Stower",
      8       "Mart Kartašev",
      9       "Mikael Norrlöf",
     10       "Mårten Björkman",
     11       "Christian Smith"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv",
     15     "arxiv_id": "2602.09772"
     16   },
     17   "scan_version": 3,
     18   "active_modules": [],
     19   "methodology_tags": [
     20     "rct"
     21   ],
     22   "key_findings": "BETR-GUI, combining LLMs, planning, genetic programming, and Bayesian optimization with a drag-and-drop editor, enables users to perform significantly better at robot programming tasks than manual-only programming. Ablations show that the planner and LLM are the critical components (removing either eliminates the advantage over manual), while removing GP or BO alone does not significantly degrade performance. Humans using the full system significantly outperform the AI assistant running alone (91.1 vs 88.1 mean score).",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "GitHub repository URL provided: https://github.com/jstyrud/BETR-GUI, referenced in Section III and footnotes."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Experimental data uploaded to the OSF repository (https://osf.io/ax5gb/overview), referenced in Sections V-D and VI."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper mentions Python/PyQt5 and Unity but does not provide a requirements.txt, Dockerfile, or detailed dependency list with versions."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are described in the paper. The GitHub repo is referenced but no README instructions or reproduction guide is detailed in the paper itself."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "95% confidence intervals are reported in Tables IV, VI, and VIII for all fixed effects (e.g., 'Trial Order: 95% CI [3.0, 10.8]')."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Linear mixed models with F-tests and t-tests are used throughout (Section VI). Post-hoc pairwise comparisons with Tukey adjustment for multiple comparisons are reported in Tables V and IX. Likelihood ratio test used for rankings (χ²(5) = 404.2, p < .001)."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Pseudo R² values reported for models (R² = 0.62 for task performance, R² = 0.21 for SUS, R² = 0.04 for FULL vs NO_HUMAN). Regression coefficients (b values) with standard errors provided in all tables, giving effect magnitudes in the original scale."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Section V-E: 'The number of participants needed was determined a priori using a simulated power analysis with α = .05, suggesting 60 participants are sufficient to reach 80% power, assuming small-medium effects.'"
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Standard deviations reported for all GUI variant scores (Table III, e.g., 'FULL: 91.14 (9.07)'), SUS scores (Table VII), and participant demographics (Table II)."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "MANUAL_ONLY variant serves as baseline, described as 'largely similar to existing GUIs like Groot in capability' (Section V-A). NO_HUMAN ablation also serves as a baseline."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The MANUAL_ONLY baseline is compared to Groot (2025), a contemporary commercial BT editor. The AI components build on recent prior work (2024-2025 references)."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Comprehensive ablation with 6 variants: FULL, MANUAL_ONLY, NO_BO, NO_GP, NO_LLM, NO_PLANNER (Section V-A). Each ablation removes one component to measure its contribution."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Three metrics used: task performance score (Section V-B), System Usability Scale (Section V-F), and preference rankings (Section VI-D)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The entire study IS a human evaluation — 60 participants used the system and provided SUS scores and rankings."
     98       },
     99       "held_out_test_set": {
    100         "applies": false,
    101         "answer": false,
    102         "justification": "Not applicable — this is a user study, not a model evaluation on a dataset split."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results broken down by GUI variant (Tables III, V, VII, IX), trial number (Figures 8, 9), and task type (tested as fixed effect, found non-significant)."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section VII discusses failure modes: users not using locking functionality (74/120), users not trusting AI suggestions, and user quotes describing negative experiences with the AI assistant."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Several negative results reported: NO_BO and NO_GP ablations did not significantly differ from FULL; NO_LLM and NO_PLANNER did not significantly outperform MANUAL_ONLY. Hypotheses 1 and 2 only 'partially supported.'"
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims that BETR-GUI 'enables users to perform better' and 'humans using the full variant perform better than the AI assistant running on its own' are both supported by the statistical results in Section VI (Tables V and VI)."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Causal claims ('enables users to perform better') are justified by the RCT design with randomized assignment, counterbalanced conditions, and controlled experiments (Section V-B). Mixed-model analysis accounts for individual differences."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "Section VIII acknowledges tasks are 'highly simplified compared to actual robot applications' but the title and abstract frame results broadly as 'Assisted Programming Interface for Behavior Trees in Robotics' without bounding to the specific simplified scenarios and participant population tested."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Section VII discusses multiple alternative explanations: users not understanding how to use AI features, planner solving most of the task leaving little for learning algorithms, short experiment time disadvantaging learning methods, and trust issues with the AI assistant."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": false,
    144         "justification": "The score function (Eq. 1-4) is a composite of goal fulfillment, tree complexity, and energy penalties with manually-set weights. The paper does not discuss whether this proxy score reflects actual task quality in real robot programming. The gap between this artificial score and real-world robot programming effectiveness is not acknowledged."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Section III states 'The LLM model used in all experiments was GPT-4' without specifying a version (e.g., gpt-4-0613) or snapshot date."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "The paper references a 'slightly updated prompt' from prior work [12] but does not provide the actual prompt text. The method is described in natural language without the prompts themselves."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No temperature, top-p, or other LLM API parameters are stated. GP and BO hyperparameters are referenced as being available in the GitHub repo but not stated in the paper. Score weight values are also deferred to the repository."
    162       },
    163       "scaffolding_described": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "The AI assistant pipeline is described in detail in Section III-D and Figure 2: seed BT → planner expansion → LLM error resolution → GP/BO optimization loop. Node locking, simulation evaluation, and user interaction points are all documented."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Score normalization procedure described in Section VI-A. Experimental randomization and counterbalancing described in Section V-B. One participant excluded due to a bug and replaced (Section V-E)."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section VIII (Future Work) opens with 'The main limitation of this work is that the benchmark tasks, out of necessity, are highly simplified compared to actual robot applications' and continues with substantive discussion."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Specific threats discussed: simplified tasks vs. real applications, short experiment time disadvantaging learning algorithms, users not understanding how to use node locking, and trust issues with the AI. Section VII discusses these specific to this study."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section VIII explicitly states that tasks are simplified, more time and training would be needed for realistic tasks, and that component importance depends on other components in the system."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Data uploaded to OSF repository (https://osf.io/ax5gb/overview). Full analyses and supplementary materials referenced as available there."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section V-F describes the full procedure: consent form, demographic questionnaire, 5-minute instruction video, 5-minute familiarization, three 15-minute timed trials, SUS after each variant, final ranking."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section V-E: 'We recruited 60 participants via flyers, mailing lists, social media, and word of mouth.' Demographics, age range (20-62), gender (10F/50M), and expertise levels reported."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The GUI automatically recorded actions and scores with timestamps (Section V-F). Score normalization explained (Section VI-A). One participant excluded and replaced with reason stated (Section V-E). Statistical analysis pipeline described with model selection criteria (Section VI)."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Funding disclosed: 'This project is supported by the Wallenberg AI, Autonomous Systems, and Software Program (WASP) funded by the Knut and Alice Wallenberg Foundation.'"
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations clearly listed: ABB Robotics, KTH, ETH Zürich, ABB Corporate Research, Ericsson. Multiple authors have ABB affiliations."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "The Wallenberg Foundation is an independent research foundation, not a product vendor. While ABB authors are listed, the funder (WASP/Wallenberg) has no direct commercial interest in the outcome."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests statement is present in the paper. Multiple authors are affiliated with ABB (a robot manufacturer), and the tool could have commercial implications, but no financial interest disclosure is made."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. GPT-4 is used as a component in the system, but the evaluation is of the human-AI system, not of GPT-4's knowledge."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Not evaluating a pre-trained model on a benchmark — this is a user study of a human-AI collaborative system."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "Not evaluating a pre-trained model on a benchmark — this is a user study of a human-AI collaborative system."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": true,
    255         "answer": true,
    256         "justification": "Section V-D: 'We pre-registered our hypotheses and planned analyses (both confirmatory and exploratory) on the Open Science Framework (OSF)' with link https://osf.io/ax5gb/overview."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "Section V-F states 'This study followed the ethical guidelines for Sweden' but does not mention IRB or ethics board approval specifically."
    262       },
    263       "demographics_reported": {
    264         "applies": true,
    265         "answer": true,
    266         "justification": "Section V-E: ages 20-62 (M=29.7, SD=8.9), 10 female/50 male, primarily engineering/CS students or professional developers. Table II reports familiarity scores across four domains."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": true,
    270         "answer": false,
    271         "justification": "No explicit inclusion or exclusion criteria stated. Participants described as 'primarily university students of engineering or computer science or professional software developers' but no screening criteria are mentioned."
    272       },
    273       "randomization_described": {
    274         "applies": true,
    275         "answer": true,
    276         "justification": "Section V-B: 'We randomized all the experiments in advance, ensuring that all ablations occurred the same number of times, and that the order in which tasks and variants occurred was also counterbalanced.' Randomization code available on GitHub."
    277       },
    278       "blinding_described": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "No mention of blinding. Participants could see which GUI variant they were using (e.g., presence/absence of AI features was obvious). No discussion of whether this awareness affected behavior."
    282       },
    283       "attrition_reported": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "Section V-E: 'After the experiments, we excluded one participant due to a one-time bug in the GUI, and recruited a new participant to redo the excluded experiment.' Final dataset: 60 valid participants."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "The system uses GPT-4 API calls but no cost, latency, or token consumption figures are reported."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Hardware specified: 'Lenovo ThinkPad laptop with an external mouse and an Intel(R) Core(TM) Ultra 9 185H 2.30 GHz CPU' (Section V-F). Each experiment was 15 minutes per task. Total: 60 participants × 3 tasks × 15 min = 45 hours of experiment time."
    299       }
    300     }
    301   },
    302   "claims": [
    303     {
    304       "claim": "BETR-GUI with full AI assistant enables users to perform significantly better than manual-only programming",
    305       "evidence": "Table V: FULL vs MANUAL_ONLY b=61.05, p<.001. Mean scores 91.14 vs 30.24 (Table III). LMM R²=0.62.",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "Removing BO or GP individually does not significantly degrade performance compared to FULL",
    310       "evidence": "Table V: FULL vs NO_BO b=1.71, p=.999; FULL vs NO_GP b=8.52, p=.807.",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "The LLM and planner are the critical AI components — removing either eliminates the advantage over manual-only",
    315       "evidence": "Table V: NO_LLM vs MANUAL_ONLY b=-8.83, p=.783; NO_PLANNER vs MANUAL_ONLY b=-13.32, p=.358 (both non-significant).",
    316       "supported": "strong"
    317     },
    318     {
    319       "claim": "Humans using the full system outperform the AI assistant running alone",
    320       "evidence": "Table VI: FULL vs NO_HUMAN b=3.08, p<.01. Mean scores 91.14 vs 88.06. R²=0.04.",
    321       "supported": "moderate"
    322     },
    323     {
    324       "claim": "System usability follows the same pattern as task performance across GUI variants",
    325       "evidence": "Table IX shows same grouping: FULL/NO_BO/NO_GP significantly higher SUS than NO_LLM/NO_PLANNER/MANUAL_ONLY.",
    326       "supported": "strong"
    327     }
    328   ],
    329   "red_flags": [
    330     {
    331       "flag": "ABB author affiliations with potential commercial interest",
    332       "detail": "Multiple authors are affiliated with ABB Robotics and ABB Corporate Research. The BETR-GUI tool could have commercial value for ABB's robot programming products, but no competing interests statement is included."
    333     },
    334     {
    335       "flag": "Gender imbalance in participants",
    336       "detail": "50 male vs 10 female participants (83% male). The paper does not discuss whether this imbalance limits generalizability or whether gender was included as a covariate."
    337     },
    338     {
    339       "flag": "Small effect size for human vs AI-only comparison",
    340       "detail": "The FULL vs NO_HUMAN comparison has R²=0.04, a very small effect size (3.08 point difference on a 0-100 scale). While statistically significant, the practical significance is questionable."
    341     },
    342     {
    343       "flag": "Unequal cell sizes across ablation conditions",
    344       "detail": "FULL and MANUAL_ONLY had 60 observations each while ablation conditions had only 15 each (Table I). The mixed model accounts for this but statistical power for ablation comparisons is lower."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    350       "authors": [
    351         "J. Becker",
    352         "N. Rush",
    353         "E. Barnes",
    354         "D. Rein"
    355       ],
    356       "year": 2025,
    357       "arxiv_id": "2507.09089",
    358       "relevance": "Counter-evidence showing software developers could perform worse with AI assistants, directly motivating this study's research question."
    359     },
    360     {
    361       "title": "A survey of Behavior Trees in robotics and AI",
    362       "authors": [
    363         "M. Iovino",
    364         "E. Scukins",
    365         "J. Styrud",
    366         "P. Ögren",
    367         "C. Smith"
    368       ],
    369       "year": 2022,
    370       "relevance": "Comprehensive survey of BT methods in robotics including learning, planning, and LLM approaches relevant to AI-assisted programming."
    371     },
    372     {
    373       "title": "The illusion of thinking: Understanding the strengths and limitations of reasoning models via the lens of problem complexity",
    374       "authors": [
    375         "P. Shojaee",
    376         "I. Mirzadeh",
    377         "K. Alizadeh",
    378         "M. Horton",
    379         "S. Bengio",
    380         "M. Farajtabar"
    381       ],
    382       "year": 2025,
    383       "relevance": "Documents LLM limitations in complex long-horizon planning tasks, contextualizing why combining LLMs with other methods is beneficial."
    384     },
    385     {
    386       "title": "Automatic behavior tree expansion with LLMs for robotic manipulation",
    387       "authors": [
    388         "J. Styrud",
    389         "M. Iovino",
    390         "M. Norrlöf",
    391         "M. Björkman",
    392         "C. Smith"
    393       ],
    394       "year": 2025,
    395       "relevance": "Direct predecessor work (BETR-XP-LLM) combining LLMs with planners for BT creation, which BETR-GUI builds upon."
    396     },
    397     {
    398       "title": "ChatDev: Communicative agents for software development",
    399       "authors": [
    400         "ChatDev team"
    401       ],
    402       "year": 2023,
    403       "relevance": "Multi-agent LLM system for software development, relevant to AI-assisted programming paradigms."
    404     },
    405     {
    406       "title": "LLM+P: Empowering large language models with optimal planning proficiency",
    407       "authors": [
    408         "B. Liu"
    409       ],
    410       "year": 2023,
    411       "arxiv_id": "2304.11477",
    412       "relevance": "Combines LLMs with PDDL planners, a key technique used in BETR-GUI's AI assistant pipeline."
    413     }
    414   ],
    415   "engagement_factors": {
    416     "practical_relevance": {
    417       "score": 1,
    418       "justification": "The BETR-GUI tool is niche to robotics behavior tree programming, not broadly applicable to most developers' daily work."
    419     },
    420     "surprise_contrarian": {
    421       "score": 1,
    422       "justification": "The finding that humans+AI outperform AI alone is mildly interesting but largely expected; the ablation showing LLM and planner are critical while GP/BO are not is a minor surprise."
    423     },
    424     "fear_safety": {
    425       "score": 0,
    426       "justification": "No safety, security, or risk concerns are raised; this is a constructive tool for robot programming."
    427     },
    428     "drama_conflict": {
    429       "score": 0,
    430       "justification": "No controversy, no company claims challenged, no conflict angle."
    431     },
    432     "demo_ability": {
    433       "score": 1,
    434       "justification": "Code is on GitHub but requires Unity simulation, PyQt5, and GPT-4 API access — significant setup effort."
    435     },
    436     "brand_recognition": {
    437       "score": 0,
    438       "justification": "Authors are from KTH, ABB, and ETH — recognized in robotics but not household names in the broader tech community."
    439     }
    440   }
    441 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs