scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (29895B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Exploring AI-Augmented Sensemaking of Patient-Generated Health Data: A Mixed-Method Study with Healthcare Professionals in Cardiac Risk Reduction",
      6     "authors": [
      7       "Pavithren V. S. Pakianathan",
      8       "Rania Islambouli",
      9       "Diogo Branco",
     10       "Albrecht Schmidt",
     11       "Tiago Guerreiro",
     12       "Jan David Smeddinck"
     13     ],
     14     "year": 2026,
     15     "venue": "arXiv",
     16     "arxiv_id": "2602.05687",
     17     "doi": null
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract claims are hedged appropriately ('Findings show that AI summaries provided quick overviews', 'HCPs raised concerns about transparency, privacy, and overreliance'). These match the qualitative and quantitative findings in the results section.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper is carefully formative and exploratory. It uses language like 'perceived', 'valued', 'suggested' rather than causal claims. The within-subjects comparison is appropriate for the claims made. The paper explicitly states it does not 'evaluate clinical effectiveness.'",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The paper explicitly states findings should be 'interpreted as insights into perceptions, practices, and sensemaking dynamics rather than as evaluations of clinical performance.' The Limitations section bounds scope to controlled conditions, single-user sessions, synthetic data, and the specific clinical domain.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "The paper discusses novelty effects (mitigated via demo mode), notes that lack of time pressure may have muted quantitative differences, discusses that synthetic data may not capture real-world variability, and acknowledges heterogeneity in human-AI collaboration effects (citing Vaccaro et al.).",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper explicitly frames this as studying 'perceptions, workflow fit, and sociotechnical considerations' rather than clinical effectiveness. It distinguishes between perceived value (what they measured) and actual clinical impact (not claimed). The paper states 'Our aim is not to evaluate clinical effectiveness.'",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 5.4 is a dedicated 'Limitation' section with substantial discussion of multiple limitations.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Specific threats include: LLM not pre-trained on sensor data so summaries may be inaccurate, no strict time limits muting efficiency gains, N=16 underpowered for non-parametric tests, synthetic rather than real PGHD, HCPs working alone rather than with patients present, study conducted in English, a non-native language for participants.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper explicitly states: 'Our aim is not to evaluate clinical effectiveness, but to generate insights into acceptable and effective interaction patterns.' It states findings reflect 'controlled conditions rather than deployment realities' and that triadic patient-HCP-AI interactions were not captured.",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding disclosure, acknowledgments section, or grant information is found in the paper. Authors are affiliated with Ludwig Boltzmann Institute, LMU Munich, and LASIGE but no funding source is stated.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are clearly listed: Ludwig Boltzmann Institute for Digital Health and Prevention (Salzburg), LMU Munich, and LASIGE/Universidade de Lisboa.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No funding source is disclosed, so independence cannot be assessed.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial interests statement is present in the paper.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key terms are defined: PGHD is defined as 'health and lifestyle-related data that individuals collect outside traditional clinical settings'; sensemaking is defined as 'an iterative process of gathering and interpreting information to enable action'; LLM usage context is explained throughout.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Three explicit contributions are enumerated at the end of the introduction: (1) empirical insights from mixed-methods evaluation of HCP perceptions; (2) investigation of conversational interfaces for PGHD exploration; (3) sociotechnical understanding of LLM integration with design implications.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The related work section engages substantively with three distinct bodies of work (PGHD challenges, health data sensemaking, AI-augmented sensemaking), explicitly positioning this work as 'bridging these areas by investigating how LLM-generated summaries and natural language interfaces can support HCPs.'",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "No repository URL or code archive is provided in the paper. The prototype was built using Plotly Dash but no source code is released.",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "The synthetic PGHD, personas, and collected study data are not released. No data download links are provided.",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The paper mentions Python and the Plotly Dash framework but provides no requirements.txt, dependency versions, or environment specification.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No reproduction instructions are provided. The paper describes the study apparatus but does not include steps to replicate the system or study.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Results report means and SDs (e.g., SUS AI mean=90.63, SD=8.44) but no confidence intervals or error bars. The Spearman correlation is reported with a p-value but no CI.",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Wilcoxon signed-rank tests are used for paired comparisons of NASA-TLX and SUS scores. Spearman correlations are used for trust-confidence associations. Linear mixed-effects models are also mentioned for robustness verification.",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "The paper reports Spearman r=.46 for trust-confidence correlation, and provides raw means enabling comparison (e.g., NASA-TLX AI=24.53 vs No-AI=27.40, a ~2.9-point reduction). SUS scores and confidence ratings are reported with means providing baseline context.",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The sample of 16 HCPs is not justified with a power analysis. The limitations section acknowledges the sample was 'underpowered for detecting small or medium effects' but no a priori justification is given.",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": true,
    175           "justification": "Standard deviations are reported alongside means for all quantitative measures (e.g., SUS AI: SD=8.44, NASA-TLX AI: SD=11.99, confidence AI: SD=0.67, trust: SD=0.98).",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "The study uses a within-subjects design comparing AI Summary vs No-AI Summary conditions, providing a direct baseline comparison.",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "The baseline (no AI summary, charts only) is appropriate for this formative study comparing AI-augmented vs non-augmented sensemaking of the same data.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": false,
    194           "answer": false,
    195           "justification": "This is not a system with separable components to ablate. It is an HCI study comparing two conditions (with/without AI summaries) plus a conversational interface exploration.",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Multiple measures are used: NASA-TLX (workload, 6 subscales), SUS (usability), self-assessed confidence, trust in AI summaries, MiniVLAT (visualization literacy), plus qualitative thematic analysis.",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "The entire study IS a human evaluation — 16 HCPs reviewed the system, rated it on multiple scales, and provided qualitative feedback through semi-structured interviews.",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": false,
    212           "answer": false,
    213           "justification": "This is a user study, not a benchmark evaluation. There is no train/test split.",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "NASA-TLX subscale breakdowns are provided (Figure 6 spider chart). Qualitative themes are broken down by category (sensemaking value, workflow augmentation, risks/tensions). Provenance analysis is broken down by modality (Tables 4, 5).",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "P12 identified a blood pressure classification error in one summary. The paper discusses concerns about accuracy, overreliance, and deskilling. The provenance analysis reveals MAPD variability across modalities.",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The paper honestly reports that workload differences were not statistically significant, SUS differences were not significant, and that confidence was comparable across conditions (AI: M=4.38 vs No-AI: M=4.27).",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "The paper specifies 'GPT-4-Turbo' for data generation and the appendix provides the model name. Temperature (0.5) and max tokens (1024) are specified in the appendix prompt configuration.",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Full prompt texts are provided in Appendix A.2 for all four data modalities (physical activity, sedentary time, blood pressure, sleep) and the combined data prompt. The prompts include role descriptions, formatting instructions, column definitions, and data passed.",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "The appendix specifies: Model: gpt-4-turbo, Temperature: 0.5, Max tokens: 1024.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "No agentic scaffolding is used. The system makes direct LLM API calls for summary generation and conversational interaction without agent loops, tool use, or retry logic.",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Section 3.2 describes synthetic data generation: persona design with SCORE2 risk stratification, building on Henriksen et al. dataset, generated with ChatGPT using Python randomization functions, one year of multimodal PGHD across four modalities. Data verified by lead researcher and two HCPs.",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "Neither the synthetic PGHD, questionnaire responses, interview transcripts, nor chat logs are released for independent verification.",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 3.3 (Procedure) details the 75-minute session structure: demographics questionnaire, MiniVLAT, demo familiarization, two experimental conditions with randomized order, conversational interface interaction, and semi-structured interview. Section 3.6 describes all instruments used.",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Section 3.4 states: 'We recruited participants via an email publicity sent to HCPs working in cardiac care at a university hospital, which was subsequently shared within their professional networks.'",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The study procedure is documented step by step (Figure 5). Audio transcription used OpenAI Whisper, verified by first author. Qualitative coding: two authors co-coded 4 participants, then first author coded remaining. Quantitative analysis methods specified (Wilcoxon, Spearman, linear mixed-effects). Post-hoc provenance analysis of LLM outputs is documented.",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": false,
    296           "answer": false,
    297           "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It is a user study evaluating HCPs' sensemaking with an LLM-augmented interface.",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": false,
    302           "answer": false,
    303           "justification": "Not a benchmark evaluation. The study evaluates HCP perceptions, not model performance on test data.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": false,
    308           "answer": false,
    309           "justification": "Not a benchmark evaluation study.",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "No pre-registration link or mention of pre-registration is found in the paper.",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": true,
    322           "answer": true,
    323           "justification": "Section 3.5 states: 'Our study protocol received official approval from the relevant institutional ethics committee (blinded for review) prior to data collection.'",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": true,
    328           "answer": true,
    329           "justification": "Section 3.4 reports: 12 women, 4 men; M=31.4 years, SD=5.0; specialized in cardiovascular rehabilitation; M=9.1 years experience, SD=5.5, range 2-20; 12 regularly used PGHD; MiniVLAT M=9.47/12; AI usage patterns detailed.",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": true,
    334           "answer": false,
    335           "justification": "The paper states participants were HCPs in cardiac care recruited via email but does not specify formal inclusion/exclusion criteria. It notes none participated in prior co-design activities, but this is the only stated criterion.",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": true,
    340           "answer": true,
    341           "justification": "Section 3.3 states conditions were presented in 'randomized order.' Personas were 'stratified by CVD risk: medium, high, very high' with 'three unique personas' per condition block. Figure 5 shows the randomized structure.",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": true,
    346           "answer": false,
    347           "justification": "No blinding is described. The AI vs No-AI conditions were visually distinct (presence/absence of summaries), and participants were aware of which condition they were in. This is acknowledged implicitly but not discussed as a limitation.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": true,
    352           "answer": false,
    353           "justification": "No attrition or dropout information is reported. The paper simply states 16 HCPs participated without mentioning whether any were excluded or dropped out.",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "No API costs, token counts, or inference latency for the GPT-4-Turbo calls are reported, despite the system making multiple LLM calls per session.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "No computational budget is stated for the LLM usage in the study.",
    368           "source": "opus"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "LLM-generated summaries reduced the time and effort required for HCPs to make sense of multimodal PGHD",
    376       "evidence": "15/16 participants reported summaries were valuable for reducing information overload with specific qualitative quotes about time savings. However, NASA-TLX showed no statistically significant difference (AI: 24.53 vs No-AI: 27.40, difference not significant).",
    377       "supported": "weak"
    378     },
    379     {
    380       "claim": "AI summaries anchored data exploration by shifting HCPs from scanning all charts to confirming specific graphs against summary details",
    381       "evidence": "Multiple participant quotes describe this behavioral shift (P2: 'I already know the summary so my eyes would be more focused on confirming it'; P5: offloaded 'making a summary in my head'). No behavioral measurement was performed.",
    382       "supported": "weak"
    383     },
    384     {
    385       "claim": "Conversational interfaces bridge data literacy gaps by enabling HCPs without data science expertise to conduct analyses they could not perform independently",
    386       "evidence": "Qualitative quotes from P4 ('We don't have the skills in terms of data analysis') and P5 ('the chatbot... that really helps!'). No objective literacy-gap measurement was conducted.",
    387       "supported": "weak"
    388     },
    389     {
    390       "claim": "Trust in AI summaries correlates positively with HCP confidence in clinical plans (Spearman r=.46, p=.001)",
    391       "evidence": "Quantitative finding from 16 participants. Trust measured on 1-5 Likert scale; confidence measured on 1-5 Likert scale per persona. Statistical test is reported with result.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Integrating AI features did not significantly increase workload or reduce usability compared to no-AI condition",
    396       "evidence": "SUS: AI=90.63 vs No-AI=85.94 (NS); NASA-TLX total: AI=24.53 vs No-AI=27.40 (NS). Wilcoxon signed-rank tests used for all comparisons.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "LLM-generated summaries were broadly factually accurate with mean absolute percentage difference of 3.96% for holistic insights and 2.68% for chat logs",
    401       "evidence": "Post-hoc provenance analysis compared LLM-derived statistics against ground-truth synthetic data values across 184 instances (holistic) and 30 instances (chat). All 25 value ranges checked were accurate.",
    402       "supported": "moderate"
    403     },
    404     {
    405       "claim": "HCPs perceive overreliance, deskilling, transparency gaps, and privacy as significant barriers to clinical AI adoption",
    406       "evidence": "Consistent qualitative themes across multiple participants with specific quotes: P8 on deskilling, P12 on 'blind trust,' P9/P10 on data privacy concerns, P4 on need for AI-generated content disclaimers. Well-supported qualitative finding.",
    407       "supported": "strong"
    408     }
    409   ],
    410   "methodology_tags": [
    411     "qualitative",
    412     "observational"
    413   ],
    414   "key_findings": "A mixed-methods within-subjects study with 16 HCPs found that LLM-generated summaries and conversational interfaces were consistently perceived as valuable for reducing sensemaking effort and bridging data literacy gaps in cardiac risk reduction workflows, though no statistically significant differences in workload (NASA-TLX) or usability (SUS) were detected between AI and no-AI conditions. Trust in AI summaries correlated with confidence in clinical plans (Spearman r=.46), raising overreliance concerns that were independently confirmed in qualitative data. HCPs consistently raised concerns about deskilling, automation bias, transparency, and privacy as adoption barriers requiring design attention. The study relied entirely on synthetic PGHD and a single formative session with 16 participants, explicitly limiting claims to perceptions rather than clinical effectiveness.",
    415   "red_flags": [
    416     {
    417       "flag": "Synthetic data throughout",
    418       "detail": "All PGHD used in the study was generated by GPT-4-Turbo — the same model used for the evaluated summarization system. Synthetic data cannot capture real-world variability, noise, or missingness. Findings about HCP perceptions may not transfer to real patient data."
    419     },
    420     {
    421       "flag": "No significant quantitative effects",
    422       "detail": "NASA-TLX workload and SUS usability showed no statistically significant differences between AI and no-AI conditions despite strong qualitative claims about efficiency gains. The quantitative-qualitative mismatch is noted but not fully reconciled."
    423     },
    424     {
    425       "flag": "n=16 underpowered",
    426       "detail": "16 participants is acknowledged as 'underpowered for detecting small or medium effects in non-parametric tests.' No a priori power analysis was conducted. The sample is also demographically homogeneous (young HCPs in one country, English non-native)."
    427     },
    428     {
    429       "flag": "No pre-registration",
    430       "detail": "Despite measuring specific hypothesized outcomes with quantitative significance tests, the study was not pre-registered, raising risk of selective reporting and outcome switching."
    431     },
    432     {
    433       "flag": "Causal language without causal design",
    434       "detail": "Abstract states 'AI summaries provided quick overviews that anchored exploration' and 'conversational interaction bridged data-literacy gaps' using causal language, based on single-session qualitative self-reports from a formative study that explicitly disclaims causal inference."
    435     },
    436     {
    437       "flag": "No funding or competing interests disclosure",
    438       "detail": "No funding source is disclosed despite the study using OpenAI commercial APIs (GPT-4-Turbo) extensively for both data generation and the evaluated system. No competing interests statement is present."
    439     },
    440     {
    441       "flag": "No inclusion/exclusion criteria",
    442       "detail": "Formal participant eligibility criteria are not stated, limiting reproducibility of participant sampling and raising questions about selection bias in the convenience sample from one hospital's professional network."
    443     }
    444   ],
    445   "cited_papers": [
    446     {
    447       "title": "Narrating Fitness: Leveraging Large Language Models for Reflective Fitness Tracker Data Interpretation",
    448       "relevance": "Direct predecessor (CHI 2024) using LLMs to generate narratives from fitness tracker data — closest existing work to this paper's primary contribution"
    449     },
    450     {
    451       "title": "Vital Insight: Assisting Experts' Context-Driven Sensemaking of Multi-modal Personal Tracking Data Using Visualization and Human-In-The-Loop LLM Agents",
    452       "relevance": "Related work using LLM agents for expert sensemaking of multimodal personal tracking data — directly compared methodology"
    453     },
    454     {
    455       "title": "Common Barriers to the Use of Patient-Generated Data Across Clinical Settings",
    456       "relevance": "Foundational CHI 2018 study on PGHD integration barriers — directly motivates this paper's problem statement and design choices"
    457     },
    458     {
    459       "title": "Using Patient-Generated Data to Support Cardiac Rehabilitation and the Transition to Self-Care",
    460       "relevance": "CHI 2023 study on PGHD in cardiac rehabilitation — direct domain precedent establishing the clinical context"
    461     },
    462     {
    463       "title": "Augmenting clinicians' analytical workflow through task-based integration of data visualizations and algorithmic insights: a user-centered design study",
    464       "relevance": "JAMIA 2024 design study integrating algorithmic insights into clinical workflows — closely related design approach with similar HCP concerns identified"
    465     },
    466     {
    467       "title": "Understanding Clinician Perceptions of GenAI: A Mixed Methods Analysis of Clinical Documentation Tasks",
    468       "relevance": "Contemporary (2025) mixed-methods study on clinician GenAI acceptance — used to contextualize findings on HCP AI adoption barriers"
    469     },
    470     {
    471       "title": "From Classification to Clinical Insights: Towards Analyzing and Reasoning About Mobile and Behavioral Health Data With Large Language Models",
    472       "relevance": "Related LLM application for mobile health sensing data interpretation — comparable methodology in adjacent domain"
    473     },
    474     {
    475       "title": "When combinations of humans and AI are useful: A systematic review and meta-analysis",
    476       "relevance": "Nature Human Behaviour 2024 meta-analysis on human-AI collaboration effectiveness — cited for heterogeneity in human-AI collaboration effects relevant to interpreting results"
    477     }
    478   ],
    479   "engagement_factors": {
    480     "practical_relevance": {
    481       "score": 2,
    482       "justification": "Addresses a real, growing clinical problem (wearable data overload) with a concrete evaluated prototype, but synthetic data and n=16 limit immediate clinical applicability."
    483     },
    484     "surprise_contrarian": {
    485       "score": 1,
    486       "justification": "No surprising quantitative effects found. The trust-overreliance correlation confirms existing concerns rather than challenging them."
    487     },
    488     "fear_safety": {
    489       "score": 2,
    490       "justification": "Raises concrete patient safety concerns about automation bias, clinical deskilling, and overreliance in high-stakes AI decisions, grounded in participant quotes about 'blind trust' risks."
    491     },
    492     "drama_conflict": {
    493       "score": 1,
    494       "justification": "Mild tension around AI-as-assistant vs AI-as-replacement in healthcare, handled carefully without generating controversy."
    495     },
    496     "demo_ability": {
    497       "score": 1,
    498       "justification": "A working prototype exists and prompts are shared in the appendix, but code and data are not released — no public demo available."
    499     },
    500     "brand_recognition": {
    501       "score": 1,
    502       "justification": "LMU Munich and Ludwig Boltzmann Institute are respectable European research institutions; Albrecht Schmidt is a well-known HCI researcher, but no top-tier AI lab affiliation."
    503     }
    504   },
    505   "hn_data": {
    506     "threads": [],
    507     "top_points": 0,
    508     "total_points": 0,
    509     "total_comments": 0
    510   }
    511 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs