ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24298B)


      1 {
      2   "paper": {
      3     "title": "Exploring AI-Augmented Sensemaking of Patient-Generated Health Data: A Mixed-Method Study with Healthcare Professionals in Cardiac Risk Reduction",
      4     "authors": [
      5       "Pavithren V. S. Pakianathan",
      6       "Rania Islambouli",
      7       "Diogo Branco",
      8       "Albrecht Schmidt",
      9       "Tiago Guerreiro",
     10       "Jan David Smeddinck"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv",
     14     "arxiv_id": "2602.05687"
     15   },
     16   "scan_version": 2,
     17   "active_modules": [],
     18   "methodology_tags": ["qualitative", "observational"],
     19   "key_findings": "In a mixed-methods study with 16 HCPs, LLM-generated summaries of patient-generated health data were perceived as reducing information overload and anchoring data exploration, though workload differences (NASA-TLX) were not statistically significant. Conversational interfaces were valued for bridging data literacy gaps and enabling personalized analysis without data science expertise. HCPs raised concerns about overreliance, deskilling, privacy, and transparency, and broadly viewed AI as an assistant rather than replacement. Trust in AI summaries correlated with confidence in final plans (Spearman r=.46, p=.001).",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No repository URL or code archive is provided in the paper. The prototype was built using Plotly Dash but no source code is released."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The synthetic PGHD, personas, and collected study data are not released. No data download links are provided."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions Python and the Plotly Dash framework but provides no requirements.txt, dependency versions, or environment specification."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No reproduction instructions are provided. The paper describes the study apparatus but does not include steps to replicate the system or study."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Results report means and SDs (e.g., SUS AI mean=90.63, SD=8.44) but no confidence intervals or error bars. The Spearman correlation is reported with a p-value but no CI."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Wilcoxon signed-rank tests are used for paired comparisons of NASA-TLX and SUS scores. Spearman correlations are used for trust-confidence associations. Linear mixed-effects models are also mentioned for robustness verification."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper reports Spearman r=.46 for trust-confidence correlation, and provides raw means enabling comparison (e.g., NASA-TLX AI=24.53 vs No-AI=27.40, a ~2.9-point reduction). SUS scores and confidence ratings are reported with means providing baseline context."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The sample of 16 HCPs is not justified with a power analysis. The limitations section acknowledges the sample was 'underpowered for detecting small or medium effects' but no a priori justification is given."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Standard deviations are reported alongside means for all quantitative measures (e.g., SUS AI: SD=8.44, NASA-TLX AI: SD=11.99, confidence AI: SD=0.67, trust: SD=0.98)."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The study uses a within-subjects design comparing AI Summary vs No-AI Summary conditions, providing a direct baseline comparison."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The baseline (no AI summary, charts only) is appropriate for this formative study comparing AI-augmented vs non-augmented sensemaking of the same data."
     80       },
     81       "ablation_study": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "This is not a system with separable components to ablate. It is an HCI study comparing two conditions (with/without AI summaries) plus a conversational interface exploration."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Multiple measures are used: NASA-TLX (workload, 6 subscales), SUS (usability), self-assessed confidence, trust in AI summaries, MiniVLAT (visualization literacy), plus qualitative thematic analysis."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The entire study IS a human evaluation — 16 HCPs reviewed the system, rated it on multiple scales, and provided qualitative feedback through semi-structured interviews."
     95       },
     96       "held_out_test_set": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "This is a user study, not a benchmark evaluation. There is no train/test split."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "NASA-TLX subscale breakdowns are provided (Figure 6 spider chart). Qualitative themes are broken down by category (sensemaking value, workflow augmentation, risks/tensions). Provenance analysis is broken down by modality (Tables 4, 5)."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "P12 identified a blood pressure classification error in one summary. The paper discusses concerns about accuracy, overreliance, and deskilling. The provenance analysis reveals MAPD variability across modalities."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper honestly reports that workload differences were not statistically significant, SUS differences were not significant, and that confidence was comparable across conditions (AI: M=4.38 vs No-AI: M=4.27)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims are hedged appropriately ('Findings show that AI summaries provided quick overviews', 'HCPs raised concerns about transparency, privacy, and overreliance'). These match the qualitative and quantitative findings in the results section."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper is carefully formative and exploratory. It uses language like 'perceived', 'valued', 'suggested' rather than causal claims. The within-subjects comparison is appropriate for the claims made. The paper explicitly states it does not 'evaluate clinical effectiveness.'"
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper explicitly states findings should be 'interpreted as insights into perceptions, practices, and sensemaking dynamics rather than as evaluations of clinical performance.' The Limitations section bounds scope to controlled conditions, single-user sessions, synthetic data, and the specific clinical domain."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "The paper discusses novelty effects (mitigated via demo mode), notes that lack of time pressure may have muted quantitative differences, discusses that synthetic data may not capture real-world variability, and acknowledges heterogeneity in human-AI collaboration effects (citing Vaccaro et al.)."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper explicitly frames this as studying 'perceptions, workflow fit, and sociotechnical considerations' rather than clinical effectiveness. It distinguishes between perceived value (what they measured) and actual clinical impact (not claimed). The paper states 'Our aim is not to evaluate clinical effectiveness.'"
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The paper specifies 'GPT-4-Turbo' for data generation and the appendix provides the model name. Temperature (0.5) and max tokens (1024) are specified in the appendix prompt configuration."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Full prompt texts are provided in Appendix A.2 for all four data modalities (physical activity, sedentary time, blood pressure, sleep) and the combined data prompt. The prompts include role descriptions, formatting instructions, column definitions, and data passed."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "The appendix specifies: Model: gpt-4-turbo, Temperature: 0.5, Max tokens: 1024."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The system makes direct LLM API calls for summary generation and conversational interaction without agent loops, tool use, or retry logic."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 3.2 describes synthetic data generation: persona design with SCORE2 risk stratification, building on Henriksen et al. dataset, generated with ChatGPT using Python randomization functions, one year of multimodal PGHD across four modalities. Data verified by lead researcher and two HCPs."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 5.4 is a dedicated 'Limitation' section with substantial discussion of multiple limitations."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Specific threats include: LLM not pre-trained on sensor data so summaries may be inaccurate, no strict time limits muting efficiency gains, N=16 underpowered for non-parametric tests, synthetic rather than real PGHD, HCPs working alone rather than with patients present, study conducted in non-native English language."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper explicitly states: 'Our aim is not to evaluate clinical effectiveness, but to generate insights into acceptable and effective interaction patterns.' It states findings reflect 'controlled conditions rather than deployment realities' and that triadic patient-HCP-AI interactions were not captured."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "Neither the synthetic PGHD, questionnaire responses, interview transcripts, nor chat logs are released for independent verification."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 3.3 (Procedure) details the 75-minute session structure: demographics questionnaire, MiniVLAT, demo familiarization, two experimental conditions with randomized order, conversational interface interaction, and semi-structured interview. Section 3.6 describes all instruments used."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Section 3.4 states: 'We recruited participants via an email publicity sent to HCPs working in cardiac care at a university hospital, which was subsequently shared within their professional networks.'"
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The study procedure is documented step by step (Figure 5). Audio transcription used OpenAI Whisper, verified by first author. Qualitative coding: two authors co-coded 4 participants, then first author coded remaining. Quantitative analysis methods specified (Wilcoxon, Spearman, linear mixed-effects). Post-hoc provenance analysis of LLM outputs is documented."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding disclosure, acknowledgments section, or grant information is found in the paper. Authors are affiliated with Ludwig Boltzmann Institute, LMU Munich, and LASIGE but no funding source is stated."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: Ludwig Boltzmann Institute for Digital Health and Prevention (Salzburg), LMU Munich, and LASIGE/Universidade de Lisboa."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding source is disclosed, so independence cannot be assessed."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It is a user study evaluating HCPs' sensemaking with an LLM-augmented interface."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "Not a benchmark evaluation. The study evaluates HCP perceptions, not model performance on test data."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "Not a benchmark evaluation study."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "No pre-registration link or mention of pre-registration is found in the paper."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": true,
    257         "answer": true,
    258         "justification": "Section 3.5 states: 'Our study protocol received official approval from the relevant institutional ethics committee (blinded for review) prior to data collection.'"
    259       },
    260       "demographics_reported": {
    261         "applies": true,
    262         "answer": true,
    263         "justification": "Section 3.4 reports: 12 women, 4 men; M=31.4 years, SD=5.0; specialized in cardiovascular rehabilitation; M=9.1 years experience, SD=5.5, range 2-20; 12 regularly used PGHD; MiniVLAT M=9.47/12; AI usage patterns detailed."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": true,
    267         "answer": false,
    268         "justification": "The paper states participants were HCPs in cardiac care recruited via email but does not specify formal inclusion/exclusion criteria. It notes none participated in prior co-design activities, but this is the only stated criterion."
    269       },
    270       "randomization_described": {
    271         "applies": true,
    272         "answer": true,
    273         "justification": "Section 3.3 states conditions were presented in 'randomized order.' Personas were 'stratified by CVD risk: medium, high, very high' with 'three unique personas' per condition block. Figure 5 shows the randomized structure."
    274       },
    275       "blinding_described": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "No blinding is described. The AI vs No-AI conditions were visually distinct (presence/absence of summaries), and participants were aware of which condition they were in. This is acknowledged implicitly but not discussed as a limitation."
    279       },
    280       "attrition_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No attrition or dropout information is reported. The paper simply states 16 HCPs participated without mentioning whether any were excluded or dropped out."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No API costs, token counts, or inference latency for the GPT-4-Turbo calls are reported, despite the system making multiple LLM calls per session."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No computational budget is stated for the LLM usage in the study."
    296       }
    297     }
    298   },
    299   "claims": [
    300     {
    301       "claim": "AI summaries reduced perceived time and effort for data sensemaking",
    302       "evidence": "15 of 16 participants reported summaries were valuable in reducing information overload. NASA-TLX showed ~2.9-point lower workload (AI: 24.53 vs No-AI: 27.40) but difference was not statistically significant (Section 4.1.1). Qualitative quotes from P12, P2, P13, P6 support perceived value.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "AI summaries served as anchors for data exploration, changing how HCPs approached data review",
    307       "evidence": "Qualitative findings in Section 4.1.2 describe HCPs beginning with summaries rather than scanning charts, offloading mental effort. Quotes from P5 and P2 support this pattern.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Conversational interfaces bridged data literacy gaps for HCPs",
    312       "evidence": "Section 4.2.3 provides quotes from P5 and P4 describing how the chatbot enabled analyses they could not perform themselves. P4 specifically notes 'we don't have that knowledge' regarding data analysis skills.",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "Higher trust in AI summaries correlated with greater confidence in final physical activity plans",
    317       "evidence": "Spearman r=.46, p=.001 (Section 4.3.2). Trust M=4.27, SD=0.98; confidence AI M=4.38, SD=0.67.",
    318       "supported": "strong"
    319     },
    320     {
    321       "claim": "LLM-generated statistics were broadly accurate with MAPD of 3.96% for holistic insights and 2.68% for chat logs",
    322       "evidence": "Post-hoc provenance analysis in Section 3.1 comparing LLM-derived averages against ground-truth values. Tables 4 and 5 provide modality-specific breakdowns. All 25 checked ranges were accurate.",
    323       "supported": "strong"
    324     },
    325     {
    326       "claim": "AI augmentation did not significantly increase workload or reduce usability",
    327       "evidence": "SUS scores: AI=90.63 vs No-AI=85.94, no significant difference. NASA-TLX: AI=24.53 vs No-AI=27.40, no significant difference on Wilcoxon signed-rank tests (Section 4.1.1).",
    328       "supported": "moderate"
    329     }
    330   ],
    331   "red_flags": [
    332     {
    333       "flag": "Small sample underpowered for quantitative claims",
    334       "detail": "N=16 is acknowledged as underpowered for detecting small or medium effects. The non-significant quantitative results may reflect insufficient power rather than absence of effect, yet the paper draws conclusions about workload and usability from these comparisons."
    335     },
    336     {
    337       "flag": "No blinding in within-subjects design",
    338       "detail": "Participants knew whether they were in the AI or No-AI condition (visible presence/absence of summaries). Demand characteristics and novelty effects could inflate perceived value of the AI condition. The demo familiarization mitigates but does not eliminate this concern."
    339     },
    340     {
    341       "flag": "Synthetic data limits ecological validity",
    342       "detail": "All PGHD was synthetically generated, and the study used personas rather than real patients. The paper acknowledges this but still draws design implications for real clinical workflows."
    343     },
    344     {
    345       "flag": "Recruitment from single institution",
    346       "detail": "All 16 HCPs were recruited from a single university hospital and its professional networks, potentially limiting diversity of perspectives. The sample skews young (23-42 years) and female (12/16)."
    347     }
    348   ],
    349   "cited_papers": [
    350     {
    351       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    352       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    353       "year": 2025,
    354       "relevance": "RCT measuring AI coding tool impact on developer productivity — directly relevant to AI programming productivity evidence base."
    355     },
    356     {
    357       "title": "When combinations of humans and AI are useful: A systematic review and meta-analysis",
    358       "authors": ["Michelle Vaccaro", "Abdullah Almaatouq", "Thomas Malone"],
    359       "year": 2024,
    360       "doi": "10.1038/s41562-024-02024-1",
    361       "relevance": "Meta-analysis on human-AI collaboration effectiveness showing heterogeneous effects — key evidence on whether AI augmentation actually helps."
    362     },
    363     {
    364       "title": "Clinician-Facing AI in the Wild: Taking Stock of the Sociotechnical Challenges and Opportunities for HCI",
    365       "authors": ["Hubert D. Zając", "Dana Li", "Xiang Dai"],
    366       "year": 2023,
    367       "doi": "10.1145/3582430",
    368       "relevance": "Survey of sociotechnical challenges in clinical AI deployment, relevant to understanding real-world AI integration barriers."
    369     },
    370     {
    371       "title": "Vital Insight: Assisting Experts' Context-Driven Sensemaking of Multi-modal Personal Tracking Data Using Visualization and Human-In-The-Loop LLM Agents",
    372       "authors": ["Jiachen Li", "Xiwen Li", "Justin Steinberg"],
    373       "year": 2025,
    374       "arxiv_id": "2410.14879",
    375       "relevance": "Closely related work on LLM agents for expert sensemaking of multimodal personal tracking data."
    376     },
    377     {
    378       "title": "Narrating Fitness: Leveraging Large Language Models for Reflective Fitness Tracker Data Interpretation",
    379       "authors": ["Konstantin R. Strömel", "Stanislas Henry", "Tim Johansson"],
    380       "year": 2024,
    381       "doi": "10.1145/3613904.3642032",
    382       "relevance": "Prior work on LLMs generating narratives from fitness tracker data — direct precursor to this study's approach."
    383     },
    384     {
    385       "title": "MindfulDiary: Harnessing Large Language Model to Support Psychiatric Patients' Journaling",
    386       "authors": ["Taewan Kim", "Seolyeong Bae", "Hyun Ah Kim"],
    387       "year": 2024,
    388       "doi": "10.1145/3613904.3642937",
    389       "relevance": "LLM application in mental health data sensemaking, demonstrating AI-generated summaries for clinical insights."
    390     },
    391     {
    392       "title": "From Classification to Clinical Insights: Towards Analyzing and Reasoning About Mobile and Behavioral Health Data With Large Language Models",
    393       "authors": ["Zachary Englhardt", "Chengqian Ma", "Margaret E. Morris"],
    394       "year": 2024,
    395       "doi": "10.1145/3659604",
    396       "relevance": "LLMs for analyzing mobile and behavioral health data — directly relevant to AI-augmented clinical data interpretation."
    397     },
    398     {
    399       "title": "Adapted large language models can outperform medical experts in clinical text summarization",
    400       "authors": ["Dave Van Veen"],
    401       "year": 2024,
    402       "relevance": "Evidence that LLMs can outperform medical experts in summarization tasks, relevant to AI capability claims in healthcare."
    403     },
    404     {
    405       "title": "SensorLM: Learning the Language of Wearable Sensors",
    406       "authors": ["Yuwei Zhang", "Kumar Ayush", "Siyuan Qiao"],
    407       "year": 2025,
    408       "arxiv_id": "2506.09108",
    409       "relevance": "Fine-tuned model translating wearable sensor data into human-readable narratives — technical approach for PGHD interpretation."
    410     }
    411   ]
    412 }

Impressum · Datenschutz