scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29553B)
      1 {
      2   "paper": {
      3     "title": "How Do Data Analysts Respond to AI Assistance? A Wizard-of-Oz Study",
      4     "authors": [
      5       "Ken Gu",
      6       "Madeleine Grunde-McLaughlin",
      7       "Andrew McNutt",
      8       "Jeffrey Heer",
      9       "Tim Althoff"
     10     ],
     11     "year": 2024,
     12     "venue": "CHI '24 (CHI Conference on Human Factors in Computing Systems)",
     13     "arxiv_id": "2309.10108",
     14     "doi": "10.1145/3613904.3641891"
     15   },
     16   "scan_version": 3,
     17   "active_modules": [],
     18   "methodology_tags": ["qualitative"],
     19   "key_findings": "In a Wizard-of-Oz study with 13 data analysts, planning assistance (helping analysts reason about analysis decisions) was valued alongside execution assistance (code help), with 12/13 analysts noting planning suggestions presented ideas they hadn't previously considered. Analysts integrated applicable planning suggestions 51.6% of the time (47/91). Suggestion helpfulness depended heavily on contextual factors: analyst background (statistical/domain expertise), timing relative to the analyst's current task, and the match between the suggestion and the analyst's own plan. The study also revealed a tension between analysts' goals (speed, completion) and the assistant's goals (robust planning), with some analysts becoming overreliant and disengaging from critical thinking.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The authors release the JupyterLab extension code: 'we release the code for this interface at https://github.com/behavioral-data/Data-Assistant-Interface.' Study materials are also available on OSF at https://osf.io/9x8bj/."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Study materials are available on OSF (https://osf.io/9x8bj/). The underlying dataset is from a published crowd-sourced analysis study (Silberzahn et al. 2018) which is publicly available. The 32 pre-written suggestions are included in the appendix."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions the study used a MacBook Pro with a 27-inch monitor and JupyterLab but does not specify software versions or dependencies needed to run the extension."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "While the study procedure is described in detail (Sec. 4) and materials are on OSF, there are no step-by-step instructions for setting up the JupyterLab extension, configuring the wizard interface, or reproducing the technical infrastructure of the study."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper reports descriptive counts (e.g., '8/13 analysts', '51.6% of the time') and means with standard deviations ('11.85 (std=4.56) suggestions') but no confidence intervals or error bars on any results."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No statistical significance tests are used. The paper makes comparative observations (e.g., planning vs execution preferences, timing effects) based on qualitative analysis and simple counts without any formal tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No effect sizes (Cohen's d, odds ratios, etc.) are reported. The paper reports descriptive proportions (e.g., 47/91 integration rate) but no formal effect size measures."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The sample of n=13 is not justified with a power analysis or saturation argument. The paper mentions selecting from 60 volunteers based on proficiency criteria and taking 13 'on a first come basis' (Sec. 4) without discussing adequacy of this sample size."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Standard deviations are reported for key descriptive statistics: 'on average 11.85 (std=4.56) suggestions, 9.85 (std=4.16) of which were planning suggestions' (Sec. 5/Fig. 5)."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No baseline comparison is included. All 13 participants received the same WoZ-assisted condition; there is no control group (e.g., analysts working without the assistant) or comparison against an existing tool like Copilot."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No baselines are included, so contemporaneity cannot be assessed."
     80       },
     81       "ablation_study": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "This is a Wizard-of-Oz study observing analyst behavior, not a system with decomposable components to ablate."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Multiple measures are used: positive/neutral/negative helpfulness ratings (Fig. 5), suggestion integration rate (51.6%), semi-structured interview data, and behavioral observations of analysts' workflows."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The entire study IS a human evaluation. Participants rated each suggestion as positive, neutral, or negative during the semi-structured interview, and the wizard observed analyst behavior throughout (Sec. 4)."
     95       },
     96       "held_out_test_set": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "This is a qualitative user study, not a machine learning evaluation. There is no test set concept applicable here."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Figure 5 provides a detailed per-category breakdown of suggestion helpfulness across all 8 suggestion categories (domain background, data wrangling, conceptual model formulation, etc.), showing positive/neutral/negative ratings per category."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper extensively discusses when suggestions failed: timing mismatches (Sec. 5.3), expertise mismatches (Sec. 5.1.2), distraction and overreliance (Sec. 5.4), excessive explanation length, and unfamiliar statistical concepts that confused participants."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Several negative findings are reported: A8 described becoming a 'clicking machine' rather than an analytical thinker (Sec. 5.4), some analysts found suggestions distracting from their own plans (Sec. 5.3), and expertise mismatches led to confusion or ignoring suggestions entirely (Sec. 5.1.2)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims about 'subtleties in contextual factors that impact suggestion helpfulness' and 'design implications for supporting different abstractions of assistance, forms of initiative, increased engagement, and alignment of goals' are all substantiated by findings in Sections 5 and 6."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper makes implicit causal claims such as 'well-timed and contextual planning assistance often helped analysts consider and make alternative decisions' (Sec. 5) and 'some analysts became distracted by the novelty of using an assistant' (Sec. 5.1.3). However, the WoZ design lacks a control group, so these causal attributions are not rigorously justified — effects could be due to the wizard's expertise, Hawthorne effects, or the lab setting."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Section 6.2 explicitly bounds generalizations: 'we selected analysts who self-identified as having a high proficiency,' acknowledges the single task/dataset limitation, notes the same-day lab design, and states 'participants may have been more willing to accept planning and execution assistance' in a lab setting."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "The paper discusses several alternative explanations: novelty effects ('if analysts had more time... they may have... overcome any potential novelty effects,' Sec. 6.2), lab setting effects ('may not have presented the same stakes as a real-world one'), and self-selection bias ('we focused on people willing to participate and perform data analysis with an AI assistant, which may positively bias our results')."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper explicitly acknowledges the gap between perceived helpfulness (what they measured) and actual analysis quality (the real outcome): 'our study concentrated on analyst preferences, leaving evaluation of the resulting quality of analyses unaddressed' (Sec. 6.2). They also discuss the tension between analyst goals (speed/completion) and the assistant's goals (robust planning)."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The paper uses 'ChatGPT [90]' throughout without specifying which model version (GPT-3.5, GPT-4, specific snapshot). The reference [90] points to 'ChatGPT: Conversational AI Language Model' (2022) but no API version or model ID is given."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The wizard's prompting strategy is described at a high level ('we created a general prompt introducing the task and dataset,' 'the wizard crafted prompts in real-time using the notebook context,' Sec. 4) but the actual prompt text is not provided anywhere in the paper or appendix."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported for the ChatGPT interactions used to generate suggestions."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. ChatGPT is used as a manual tool by the human wizard who controls all interactions; there is no autonomous agent pipeline."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The paper describes dataset preparation ('We sampled a subset of the data to simplify computational manipulation while maintaining the overall distribution,' Sec. 4), suggestion preparation (32 pre-written suggestions, Table 1 and Appendix A), and the qualitative coding process ('two of the authors conducted iterative open coding on the recordings,' Sec. 4)."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 6.2 'Limitations and Future Work' is a dedicated subsection with substantive discussion of multiple limitations."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Specific threats are discussed: high-proficiency participant selection ('analysts with less statistical and programming expertise may have different experiences'), single task in a short same-day session ('we chose to conduct a same-day, in-person study of two hours'), lab vs real-world stakes ('may not have presented the same stakes'), positive selection bias, and novelty effects (Sec. 6.2)."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper explicitly states what results do NOT show: 'our study concentrated on analyst preferences, leaving evaluation of the resulting quality of analyses unaddressed' and 'Determining analysis correctness is inherently challenging' (Sec. 6.2). The WoZ limitations section (Sec. 4) also states 'our goals in this work are to better understand the design of such assistants rather than their implementation.'"
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "Raw data (session recordings, interview transcripts, wizard logs) are not released. The OSF repository contains study materials and the supplementary material, but not the raw participant data."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Data collection is well described: Zoom recordings with consent, coordinator note-taking, wizard observation, semi-structured interviews with suggestion-by-suggestion review, and behavioral logging of suggestion integration (Sec. 4)."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Recruitment is described: 'We contacted potential participants through analysis-related mailing lists at our institution. From a pool of 60 volunteers, we invited those who rated their programming and statistics proficiency 4 out of 5 or greater and were familiar with computational notebooks, selecting a final 13 on a first come basis' (Sec. 4)."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The analysis pipeline is documented: 'The study coordinator took notes throughout... One author viewed the recordings, transcribed relevant episodes, and logged whether suggestions were included in participants' working analyses. To define common themes that emerged, two of the authors conducted iterative open coding on the recordings' (Sec. 4)."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Funding is disclosed in the Acknowledgments: 'T.A. and K.G. were supported in part by NSF grant IIS-1901386, NSF CAREER IIS-2142794, and the Bill & Melinda Gates Foundation (INV-004841).'"
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "All five authors are listed with University of Washington affiliations. No product is being evaluated that would create a conflict."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "NSF and the Bill & Melinda Gates Foundation are independent research funders with no financial stake in whether AI assistants help data analysts."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is included in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "This is a Wizard-of-Oz user study about human-AI interaction, not a benchmark evaluation of model capability. ChatGPT is used as a tool by the wizard, not evaluated on a benchmark."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No benchmark evaluation of pre-trained model capability is conducted. The study evaluates human reactions to suggestions, not model performance."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No benchmark evaluation is involved; this is a qualitative user study."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "No pre-registration is mentioned. While study materials are on OSF (https://osf.io/9x8bj/), there is no reference to a pre-registered analysis plan."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "No mention of IRB or ethics board approval anywhere in the paper, despite involving 13 human participants in a recorded lab study."
    259       },
    260       "demographics_reported": {
    261         "applies": true,
    262         "answer": true,
    263         "justification": "Table 2 provides detailed participant demographics: gender, occupation/background, AI code assistant usage frequency, preferred programming language, and self-rated language and statistics experience (on a 5-point scale) for all 13 participants."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": true,
    267         "answer": true,
    268         "justification": "Inclusion criteria are stated: 'we invited those who rated their programming and statistics proficiency 4 out of 5 or greater and were familiar with computational notebooks' (Sec. 4). The exclusion rationale is also given: 'We chose not to consider novices, as they would be too limited by execution challenges to be able to benefit from our analysis planning.'"
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "This is not a between-subjects experimental study. All participants received the same WoZ-assisted condition; there are no treatment groups requiring randomization."
    274       },
    275       "blinding_described": {
    276         "applies": true,
    277         "answer": true,
    278         "justification": "Blinding is a core feature of the WoZ design: 'participants interacted with a data analysis assistant... which, unbeknownst to them, was controlled by a human wizard' (Sec. 4). Participants believed they were interacting with an AI assistant. 'At the end of the study, we revealed how the assistant actually worked.'"
    279       },
    280       "attrition_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No mention of whether any participants dropped out, experienced technical issues, or failed to complete the study. All 13 appear in Table 2 and results, but attrition is not explicitly addressed."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "This is a qualitative Wizard-of-Oz user study, not a system deployment. Inference cost is not relevant to the study's claims about human-AI interaction dynamics."
    291       },
    292       "compute_budget_stated": {
    293         "applies": false,
    294         "answer": false,
    295         "justification": "This is a qualitative user study; computational budget is not relevant to the study's claims."
    296       }
    297     }
    298   },
    299   "claims": [
    300     {
    301       "claim": "Analysts integrated applicable planning suggestions 51.6% of the time (47/91 suggestions).",
    302       "evidence": "Figure 5 and Sec. 5 report that 'for planning suggestions which analysts could reasonably incorporate into their notebooks (i.e., those that were not results interpretation and domain background), analysts integrated suggestions 51.6% of the time (47/91).'",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Planning assistance helped most analysts consider decisions they had not previously contemplated.",
    307       "evidence": "12/13 analysts 'noted that planning suggestions often presented ideas that they had not previously considered' (Sec. 5.2.1). Multiple quotes support this, e.g., A10: 'This agent could take a more overall approach and help you think about the overall approach.'",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Suggestion helpfulness is strongly influenced by timing relative to the analyst's current task.",
    312       "evidence": "9/13 analysts felt that 'unhelpful' suggestions would have been useful at a different moment (Sec. 5.3). Multiple participants describe ignoring good suggestions due to being 'focused on the variables at the time' (A3) or 'trying to complete what I already started' (A5).",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "Analyst background (statistical expertise, domain knowledge, AI experience) impacts perception of suggestion helpfulness.",
    317       "evidence": "Sections 5.1.1-5.1.3 document varying reactions based on expertise level. A1 (statistics professor) found results interpretation suggestions unhelpful, while other analysts appreciated them. Some analysts (A3, A5, A13) ignored unfamiliar statistical suggestions entirely.",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "AI assistance can reduce analysts' critical thinking and lead to overreliance.",
    322       "evidence": "A8 stated the assistant 'made me this clicking machine (rather) than an analytical thinker... Autopilot was a welcome path that I could choose' (Sec. 5.4). However, this observation comes primarily from one participant.",
    323       "supported": "weak"
    324     },
    325     {
    326       "claim": "There is a tension between analyst goals (speed and completion) and planning assistance goals (methodical and robust planning).",
    327       "evidence": "Section 6 discusses this tension: 'we found differences in goals between the analyst (i.e., speed and completion) and planning assistance (i.e., methodical and robust planning).' Analysts frequently preferred execution assistance for its time-saving benefits while finding planning assistance more cognitively demanding.",
    328       "supported": "moderate"
    329     }
    330   ],
    331   "red_flags": [
    332     {
    333       "flag": "Small sample size without justification",
    334       "detail": "n=13 participants selected on a first-come basis from 60 volunteers. No sample size justification, saturation analysis, or power analysis is provided. While small samples are common in qualitative HCI research, quantitative claims like '51.6% integration rate' are made from this small base."
    335     },
    336     {
    337       "flag": "No control group",
    338       "detail": "All participants received the WoZ-assisted condition. Without a control group (analysts working without the assistant), it's impossible to attribute observed behaviors to the assistant rather than the task, lab setting, or think-aloud protocol."
    339     },
    340     {
    341       "flag": "Single task and dataset",
    342       "detail": "All 13 analysts worked on the same soccer red card analysis task with the same dataset. Findings may be specific to this moderate-complexity statistical analysis domain and may not generalize to other types of data analysis."
    343     },
    344     {
    345       "flag": "IRB/ethics approval not mentioned",
    346       "detail": "The study involves 13 human participants in a recorded lab study with a deception element (WoZ protocol), yet no IRB or ethics board approval is mentioned."
    347     },
    348     {
    349       "flag": "Wizard introduces systematic bias",
    350       "detail": "The wizard manually decided timing and selection of suggestions, creating a confound: observed effects may reflect the wizard's expertise in choosing when to intervene rather than properties of AI-based assistance. The paper discusses limitations of the WoZ approach (Sec. 4: 'our goals in this work are to better understand the design of such assistants rather than their implementation') but the wizard's judgment is a substantial uncontrolled variable."
    351     }
    352   ],
    353   "cited_papers": [
    354     {
    355       "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models",
    356       "authors": ["Shraddha Barke", "Michael B. James", "Nadia Polikarpova"],
    357       "year": 2022,
    358       "relevance": "Empirical study of how programmers interact with code-generating AI models, identifying acceleration and exploration modes relevant to AI-assisted programming."
    359     },
    360     {
    361       "title": "Evaluating Large Language Models Trained on Code",
    362       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    363       "year": 2021,
    364       "arxiv_id": "2107.03374",
    365       "relevance": "Foundational Codex evaluation paper establishing benchmarks for LLM code generation capability."
    366     },
    367     {
    368       "title": "On the Design of AI-powered Code Assistants for Notebooks",
    369       "authors": ["Andrew M. McNutt", "Chenglong Wang", "Robert DeLine", "Steven Mark Drucker"],
    370       "year": 2023,
    371       "relevance": "Closely related work studying computational notebooks as a medium for AI-based execution assistance for data scientists."
    372     },
    373     {
    374       "title": "Expectation vs. Experience: Evaluating the Usability of Code Generation Tools Powered by Large Language Models",
    375       "authors": ["Priyan Vaithilingam", "Tianyi Zhang", "Elena L. Glassman"],
    376       "year": 2022,
    377       "relevance": "Usability evaluation of LLM-powered code generation tools, finding gaps between user expectations and actual experience."
    378     },
    379     {
    380       "title": "Productivity assessment of neural code completion",
    381       "authors": ["Albert Ziegler", "Eirini Kalliamvakou", "Shawn Simister"],
    382       "year": 2022,
    383       "relevance": "Empirical assessment of AI code completion productivity impacts, relevant to understanding execution assistance benefits."
    384     },
    385     {
    386       "title": "The Programmer's Assistant: Conversational Interaction with a Large Language Model for Software Development",
    387       "authors": ["Steven I. Ross", "Fernando Martinez", "Stephanie Houde", "Michael J. Muller", "Justin D. Weisz"],
    388       "year": 2023,
    389       "relevance": "Study of conversational LLM interaction for software development, examining how programmers use AI assistants."
    390     },
    391     {
    392       "title": "Understanding the Usability of AI Programming Assistants",
    393       "authors": ["Jenny Liang", "Chenyang Yang", "Brad A. Myers"],
    394       "year": 2023,
    395       "arxiv_id": "2303.17125",
    396       "relevance": "Study of usability challenges in AI programming assistants, relevant to understanding human-AI interaction in coding."
    397     },
    398     {
    399       "title": "Sparks of Artificial General Intelligence: Early experiments with GPT-4",
    400       "authors": ["Sébastien Bubeck", "Varun Chandrasekaran", "Ronen Eldan"],
    401       "year": 2023,
    402       "arxiv_id": "2303.12712",
    403       "relevance": "Early GPT-4 capability evaluation across domains including coding, relevant to understanding LLM capabilities for data analysis assistance."
    404     },
    405     {
    406       "title": "LIDA: A Tool for Automatic Generation of Grammar-Agnostic Visualizations and Infographics using Large Language Models",
    407       "authors": ["Victor C. Dibia"],
    408       "year": 2023,
    409       "arxiv_id": "2303.02927",
    410       "relevance": "LLM-based tool for automated visualization generation, relevant to AI-assisted data analysis workflows."
    411     },
    412     {
    413       "title": "How Do Analysts Understand and Verify AI-Assisted Data Analyses?",
    414       "authors": ["Ken Gu", "Ruoxi Shang", "Tim Althoff", "Chenglong Wang", "Steven Mark Drucker"],
    415       "year": 2023,
    416       "arxiv_id": "2309.10947",
    417       "relevance": "Companion study by some of the same authors on understanding and verifying AI-assisted data analysis outputs."
    418     },
    419     {
    420       "title": "To trust or to think: cognitive forcing functions can reduce overreliance on AI in AI-assisted decision-making",
    421       "authors": ["Zana Buçinca", "Maja Barbara Malaya", "Krzysztof Z Gajos"],
    422       "year": 2021,
    423       "relevance": "Study on overreliance in AI-assisted decision-making and cognitive forcing functions to mitigate it, directly relevant to the overreliance findings in this paper."
    424     },
    425     {
    426       "title": "Explanations can reduce overreliance on AI systems during decision-making",
    427       "authors": ["Helena Vasconcelos", "Matthew Jörke", "Madeleine Grunde-McLaughlin"],
    428       "year": 2023,
    429       "relevance": "Study on how explanations affect overreliance in AI systems, directly relevant to the suggestion explanation findings in this paper."
    430     }
    431   ],
    432   "engagement_factors": {
    433     "practical_relevance": {
    434       "score": 2,
    435       "justification": "The 7 design guidelines (Table 3) are actionable for teams building data analysis assistants, and the JupyterLab extension is released, though it's a research prototype not a production tool."
    436     },
    437     "surprise_contrarian": {
    438       "score": 1,
    439       "justification": "The finding that planning assistance can cause overreliance and 'autopilot' behavior is somewhat counterintuitive, but most findings confirm expected patterns about timing, expertise matching, and user preferences."
    440     },
    441     "fear_safety": {
    442       "score": 0,
    443       "justification": "No AI safety, security, or risk concerns are raised."
    444     },
    445     "drama_conflict": {
    446       "score": 0,
    447       "justification": "No controversial claims or conflict with industry narratives."
    448     },
    449     "demo_ability": {
    450       "score": 1,
    451       "justification": "The JupyterLab extension code is released on GitHub, but it requires a human wizard to operate and is not a standalone demo."
    452     },
    453     "brand_recognition": {
    454       "score": 1,
    455       "justification": "University of Washington is a well-known institution and CHI is a top venue, but the work doesn't involve major AI products or labs. ChatGPT is mentioned as a tool used by the wizard."
    456     }
    457   }
    458 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs