ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (25602B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Human-Centered AI Product Prototyping with No-Code AutoML: Conceptual Framework, Potentials and Limitations",
      6     "authors": [
      7       "Mario Truss",
      8       "Marc Schmitt"
      9     ],
     10     "year": 2024,
     11     "venue": "International Journal of Human–Computer Interaction",
     12     "arxiv_id": "2402.07933",
     13     "doi": "10.1080/10447318.2024.2425454"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "The abstract claims the hybrid evaluation approach 'validated the utility' and highlighted 'efficacy in supporting AI non-experts,' but the case study involves a single researcher using one tool on one dataset with no formal non-expert user study to back those claims.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The paper repeatedly claims NC AutoML 'can improve' and 'addresses challenges' in AI product prototyping, but the design is a single-user, single-tool, single-dataset case study with no control group or comparison, insufficient for causal inference.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The limitations section appropriately bounds scope, but the conclusion states 'this innovation is not confined to any specific sector; rather, it permeates all industries,' far exceeding what a single case study with Google Vertex AI for text classification supports.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper presents a single interpretation—that NC AutoML addresses the identified challenges—without considering alternatives such as other democratization tools, whether the challenges could be solved differently, or why NC AutoML might systematically fail.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper measures qualitative criteria satisfaction by the researchers themselves, then claims this validates 'utility' and 'efficacy in supporting AI non-experts,' conflating researcher-assessed criteria with actual non-expert outcomes.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 5.4 is explicitly titled 'Limitations of Research' and lists four specific limitations regarding generalizability.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The limitations section specifically notes that only one AutoML solution (Google Vertex AI), one dataset, and one ML functionality type (classification) were used, and that user acceptance testing with actual AI non-experts was not conducted.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Limitations are stated in Section 5.4, but the conclusion violates them by making industry-wide claims; the stated scope boundaries are not respected throughout the paper.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper explicitly states 'No explicit funding' under the Funder information section.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Author affiliations are disclosed on the title page: Mario Truss at Adobe, Marc Schmitt at Siemens.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No external funder is identified; the work is described as independent research permitted by the employers.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The declaration of interest statement reads: 'The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work.'",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "No-code AutoML is defined as an approach allowing non-experts to develop ML models via a GUI without coding; human-centered AI is grounded through multiple cited definitions; prototyping concepts are discussed through Floyd 1984 and related literature.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper explicitly states its contribution as a conceptual framework (DSR artifact) showing how NC AutoML can integrate into the AI product prototyping process for non-experts, addressing three stated research questions.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 2 provides a structured literature review of 48 documents, situating the work relative to Wizard-of-Oz prototyping, LLM-based prototyping, and existing AutoML research, explicitly identifying a research gap.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": false,
    121           "justification": "No source code is released; the paper describes using a commercial platform (Google Vertex AI) and presents a conceptual framework without any accompanying code.",
    122           "source": "haiku"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "The 3440-example customer support ticket dataset used in the case study is not released or identified as a public benchmark.",
    128           "source": "haiku"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "No environment or dependency specifications are provided; the only tool mentioned is Google Vertex AI accessed through its web UI.",
    134           "source": "haiku"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No step-by-step reproduction instructions are provided; the case study narrative describes what was done at a high level but is not reproducible without the original dataset and specific platform configuration.",
    140           "source": "haiku"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": false,
    146           "answer": false,
    147           "justification": "The evaluation is entirely qualitative (criteria-based), so CIs or error bars are not applicable.",
    148           "source": "haiku"
    149         },
    150         "significance_tests": {
    151           "applies": false,
    152           "answer": false,
    153           "justification": "No comparative quantitative claims are made that would require significance testing; the evaluation is qualitative criteria-based.",
    154           "source": "haiku"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": false,
    158           "answer": false,
    159           "justification": "No quantitative outcome comparisons are made, so effect sizes are not applicable.",
    160           "source": "haiku"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "The case study uses 3440 training examples with no justification provided for why this number was used or what statistical power it provides.",
    166           "source": "haiku"
    167         },
    168         "variance_reported": {
    169           "applies": false,
    170           "answer": false,
    171           "justification": "No quantitative repeated measurements are taken, making variance reporting inapplicable.",
    172           "source": "haiku"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "No baseline comparison is included; the framework is evaluated against self-derived criteria without comparing it to existing prototyping approaches under the same conditions.",
    180           "source": "haiku"
    181         },
    182         "baselines_contemporary": {
    183           "applies": false,
    184           "answer": false,
    185           "justification": "No baselines are included, so contemporaneity is not applicable.",
    186           "source": "haiku"
    187         },
    188         "ablation_study": {
    189           "applies": false,
    190           "answer": false,
    191           "justification": "The paper evaluates a single framework holistically; no components are isolated for ablation.",
    192           "source": "haiku"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "The criteria-based evaluation uses multiple dimensions: three prototyping stages (A1-A3), four product success criteria (B1-B4), and four challenges (C1-C4), plus ML metrics (precision, recall, F1, AUC, ROC) reported by the AutoML tool.",
    198           "source": "haiku"
    199         },
    200         "human_evaluation": {
    201           "applies": true,
    202           "answer": false,
    203           "justification": "The research question concerns supporting AI non-experts, but no formal multi-participant human evaluation is conducted; only one unnamed individual uses the tool, and no formal user study methodology is applied.",
    204           "source": "haiku"
    205         },
    206         "held_out_test_set": {
    207           "applies": false,
    208           "answer": false,
    209           "justification": "The research evaluation is of a conceptual framework, not a prediction task; the train/test split mentioned is internal to the AutoML tool, not a research evaluation design.",
    210           "source": "haiku"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Section 4.4 provides per-criterion evaluation across 11 labeled criteria (A1-A3, B1-B4, C1-C4), each discussed separately.",
    216           "source": "haiku"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Sections 5.1 and 5.4 discuss specific cases where NC AutoML fails or is insufficient: complex data problems, black-box nature, UI limitations for holistic UX testing, and inability to fully address all challenges.",
    222           "source": "haiku"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "The paper reports that NC AutoML cannot fully resolve complex data problems, has black-box limitations, and cannot support full UX testing without coding knowledge (RQ3 results in Section 5.1).",
    228           "source": "haiku"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": false,
    235           "justification": "Only 'Google Vertex AI' is named, with no version number or access date; this is insufficient to reproduce the case study.",
    236           "source": "haiku"
    237         },
    238         "prompts_provided": {
    239           "applies": false,
    240           "answer": false,
    241           "justification": "The paper does not involve LLM prompting; it uses a GUI-based AutoML platform.",
    242           "source": "haiku"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": false,
    247           "justification": "The only parameter mentioned is the 80/20 train/validation split; algorithm selection, optimization settings, and other hyperparameters are noted as being automated by NC AutoML but not reported.",
    248           "source": "haiku"
    249         },
    250         "scaffolding_described": {
    251           "applies": false,
    252           "answer": false,
    253           "justification": "No agentic scaffolding is used; the paper evaluates a GUI-based commercial tool.",
    254           "source": "haiku"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": false,
    259           "justification": "Data preprocessing is described only at a high level (CSV upload, class balance check, label correction via UI); no detailed preprocessing steps are documented that would allow reproduction.",
    260           "source": "haiku"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": false,
    267           "justification": "The customer support ticket dataset is not released and no public source is identified.",
    268           "source": "haiku"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": false,
    273           "justification": "The dataset is described only as '3440 training examples' for customer support ticket classification; how it was collected, labeled, or sourced is not described.",
    274           "source": "haiku"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": true,
    278           "answer": false,
    279           "justification": "The single 'NC AutoML user' with no prior ML knowledge is mentioned but their selection, background, and how they were recruited for the case study are not described.",
    280           "source": "haiku"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": false,
    285           "justification": "The data pipeline is partially described (file upload, balance check, label correction) but lacks sufficient detail—no data schema, preprocessing code, or filtering criteria are provided.",
    286           "source": "haiku"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": false,
    292           "answer": false,
    293           "justification": "The paper is not evaluating LLM or benchmark task performance; contamination of training data is not applicable.",
    294           "source": "haiku"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": false,
    298           "answer": false,
    299           "justification": "Not applicable; the paper is evaluating a conceptual framework, not model generalization on held-out benchmarks.",
    300           "source": "haiku"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": false,
    304           "answer": false,
    305           "justification": "No benchmark evaluation of pre-trained models is conducted.",
    306           "source": "haiku"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "No formal human subjects study is conducted; the one individual using NC AutoML is part of a DSR case study, not a pre-registerable experiment.",
    314           "source": "haiku"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No formal human subjects research is conducted requiring ethics review.",
    320           "source": "haiku"
    321         },
    322         "demographics_reported": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No formal participant study is conducted.",
    326           "source": "haiku"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No formal participant study is conducted.",
    332           "source": "haiku"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No experimental human study is conducted.",
    338           "source": "haiku"
    339         },
    340         "blinding_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No experimental human study is conducted.",
    344           "source": "haiku"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No longitudinal human study is conducted.",
    350           "source": "haiku"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": true,
    356           "answer": false,
    357           "justification": "Model training took 5 hours on Google Vertex AI, but inference latency or cost is not reported, and no pricing or computational resource data is provided.",
    358           "source": "haiku"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "Only training time (5 hours for 3440 examples) is mentioned; no compute budget, cloud costs, or resource allocation is specified.",
    364           "source": "haiku"
    365         }
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "NC AutoML can be integrated into all three stages of AI product prototyping (ideation, prototyping, testing).",
    372       "evidence": "Single case study with Google Vertex AI performing text classification; each stage is mapped qualitatively to AutoML features.",
    373       "supported": "weak"
    374     },
    375     {
    376       "claim": "NC AutoML enables evaluation of all four product success criteria (viability, feasibility, usability, desirability) for AI product prototypes.",
    377       "evidence": "Criteria-based qualitative analysis by the authors themselves; no independent user validation.",
    378       "supported": "weak"
    379     },
    380     {
    381       "claim": "NC AutoML addresses the four challenges of AI product prototyping for non-experts (knowledge gaps, boundary objects, unpredictable AI behavior, development complexity).",
    382       "evidence": "Qualitative mapping from case study observations; partially hedged with 'it can be assumed' language throughout.",
    383       "supported": "weak"
    384     },
    385     {
    386       "claim": "NC AutoML delivers comparable results to traditional ML implementation.",
    387       "evidence": "Cited from prior literature (He et al., Calefato et al.) rather than demonstrated in this paper.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "The black-box nature of NC AutoML and inability to test the holistic AI UX represent significant limitations.",
    392       "evidence": "Explicitly stated in Sections 5.1 and 5.4 as limitations preventing full product validation and human-centricity.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "NC AutoML reduces development complexity by providing a no-code interface, but does not eliminate the need for AI-related knowledge.",
    397       "evidence": "Discussed qualitatively in C4 evaluation: complex data problems still require external expertise beyond NC AutoML's capabilities.",
    398       "supported": "moderate"
    399     }
    400   ],
    401   "methodology_tags": [
    402     "case-study",
    403     "qualitative",
    404     "theoretical"
    405   ],
    406   "key_findings": "This Design Science Research paper develops a conceptual framework for integrating no-code AutoML (demonstrated via Google Vertex AI) into the AI product prototyping process for non-experts. The single case study on text classification (3440 examples, one non-expert user) shows NC AutoML can support ideation, prototyping, and testing stages while providing visibility into model performance metrics and AI behavior. However, the paper identifies significant limitations: NC AutoML cannot fully test the holistic AI user experience without coding, complex data quality issues require external expertise, and the black-box nature creates barriers to production and ensuring human-centricity. No formal user study with multiple participants was conducted, and all validation was performed by the authors against self-derived criteria.",
    407   "red_flags": [
    408     {
    409       "flag": "Single-user case study",
    410       "detail": "The entire empirical validation rests on one unnamed individual using one AutoML platform on one dataset for one ML task; no formal multi-participant user study was conducted despite the research question being about supporting AI non-experts."
    411     },
    412     {
    413       "flag": "Self-referential evaluation",
    414       "detail": "The criteria used to evaluate the framework (A1-A3, B1-B4, C1-C4) were derived by the same researchers who developed the framework, creating circular validation without independent assessment."
    415     },
    416     {
    417       "flag": "Conclusion overshoots evidence",
    418       "detail": "Despite acknowledging single-case limitations in Section 5.4, the conclusion claims the framework 'is not confined to any specific sector; rather, it permeates all industries' — unsupported by the evidence presented."
    419     },
    420     {
    421       "flag": "No actual non-expert user study",
    422       "detail": "The core research claim is about helping AI non-experts, yet no formal user acceptance testing with actual product managers, designers, or other non-experts was conducted; the limitations explicitly acknowledge this gap."
    423     },
    424     {
    425       "flag": "Platform-specific findings",
    426       "detail": "All empirical observations are from Google Vertex AI; cross-platform validation is claimed verbally ('it was further validated that comparable commercial solutions...offer comparable functionalities') without systematic evidence."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "AutoML: A Survey of the State-of-the-Art",
    432       "relevance": "Core AutoML survey used to justify NC AutoML's capabilities and to define the AutoML process referenced throughout the framework."
    433     },
    434     {
    435       "title": "Re-examining Whether, Why, and How Human-AI Interaction Is Uniquely Difficult to Design",
    436       "relevance": "Foundational reference for identifying challenges in AI product prototyping for non-experts, heavily cited across multiple challenge categories."
    437     },
    438     {
    439       "title": "Fits and Starts: Enterprise Use of AutoML and the Role of Humans in the Loop",
    440       "relevance": "Empirical study of AutoML in enterprise settings, cited as evidence for AutoML as a productivity enhancer and human-centered design tool."
    441     },
    442     {
    443       "title": "PromptMaker: Prompt-based Prototyping with Large Language Models",
    444       "relevance": "Represents the LLM-based AI prototyping alternative that the paper situates itself against, from Google Research."
    445     },
    446     {
    447       "title": "ProtoAI: Model-Informed Prototyping for AI-Powered Interfaces",
    448       "relevance": "Related AI prototyping approach combining wireframing with functional model integration, a key comparator in the related work."
    449     },
    450     {
    451       "title": "Democratizing artificial intelligence: How no-code AI can leverage machine learning operations",
    452       "relevance": "Directly related work on no-code AI democratization and citizen development, cited as the closest existing work to the paper's contribution."
    453     },
    454     {
    455       "title": "Automated machine learning: AI-driven decision making in business analytics",
    456       "relevance": "Co-authored by one of the paper's authors; establishes AutoML's role in business analytics and is a key empirical foundation for AutoML performance claims."
    457     },
    458     {
    459       "title": "A Design Science Research Methodology for Information Systems Research",
    460       "relevance": "Methodological foundation for the entire research approach; the DSR methodology is central to the paper's validity claims."
    461     }
    462   ],
    463   "engagement_factors": {
    464     "practical_relevance": {
    465       "score": 2,
    466       "justification": "Product managers and designers without ML expertise could directly consult this framework when planning AI prototyping projects using tools like Google Vertex AI or AWS SageMaker Canvas."
    467     },
    468     "surprise_contrarian": {
    469       "score": 0,
    470       "justification": "The finding that NC AutoML can democratize AI prototyping aligns with widely held intuitions and vendor marketing; no surprising or counter-intuitive results are presented."
    471     },
    472     "fear_safety": {
    473       "score": 1,
    474       "justification": "Briefly raises concerns about data bias, privacy, and AI reliability risks in citizen development, but these are treated as secondary considerations rather than the paper's focus."
    475     },
    476     "drama_conflict": {
    477       "score": 0,
    478       "justification": "No controversy or conflict angle; the paper is supportive of NC AutoML adoption with no significant pushback against any established position."
    479     },
    480     "demo_ability": {
    481       "score": 1,
    482       "justification": "Google Vertex AI, AWS SageMaker Canvas, and Azure AutoML are publicly accessible platforms that practitioners could try, but the paper provides no linked demo or replication package."
    483     },
    484     "brand_recognition": {
    485       "score": 1,
    486       "justification": "Authors are from Adobe and Siemens, and the case study uses Google Vertex AI, providing some brand recognition, but none of these are the primary research output."
    487     }
    488   },
    489   "hn_data": {
    490     "threads": [
    491       {
    492         "hn_id": "39453382",
    493         "title": "UFO: A UI-Focused Agent for Windows OS Interaction",
    494         "points": 1,
    495         "comments": 0,
    496         "url": "https://news.ycombinator.com/item?id=39453382"
    497       }
    498     ],
    499     "top_points": 1,
    500     "total_points": 1,
    501     "total_comments": 0
    502   }
    503 }

Impressum · Datenschutz