scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25671B)
      1 {
      2   "paper": {
      3     "title": "Human-Centered AI Product Prototyping with No-Code AutoML: Conceptual Framework, Potentials and Limitations",
      4     "authors": ["Mario Truss", "Marc Schmitt"],
      5     "year": 2024,
      6     "venue": "International Journal of Human-Computer Interaction",
      7     "arxiv_id": "2402.07933",
      8     "doi": "10.1080/10447318.2024.2425454"
      9   },
     10   "scan_version": 3,
     11   "active_modules": [],
     12   "methodology_tags": ["case-study", "qualitative"],
     13   "key_findings": "The paper proposes a conceptual framework for integrating no-code AutoML into the AI product prototyping process, targeting AI non-experts. A single case study using Google Vertex AI for customer support ticket classification (3,440 examples) demonstrates that NC AutoML can support ideation, prototyping, and testing stages while providing insights into viability, feasibility, usability, and desirability. The authors identify limitations including inability to handle complex data problems, black-box model opacity, and restricted UI testing capabilities.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No source code or repository is released. The paper presents a conceptual framework and a case study walkthrough of Google AutoML but provides no code artifacts."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The case study uses a dataset of 3,440 customer support training examples, but this dataset is not released or made publicly available."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment or dependency specifications are provided. The paper only mentions using Google Vertex AI without version information or technical setup details."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The case study is described narratively but lacks sufficient detail for replication (e.g., exact AutoML configuration, data format specifics)."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "The paper presents no quantitative experiments by the authors. The ML metrics shown (precision, recall, F1) are screenshots from the Google AutoML UI, not the paper's own experimental results requiring uncertainty quantification."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "The paper makes no comparative quantitative claims requiring significance tests. The evaluation is entirely qualitative and criteria-based."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No quantitative measurements are reported by the authors that would require effect size reporting. The paper's contribution is a conceptual framework evaluated qualitatively."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper uses a single case study with one NC AutoML tool, one dataset (3,440 examples), and one use case (classification) but does not justify why this sample is sufficient for the claims made about the framework's general utility."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experimental runs are conducted by the authors. The single case study walkthrough does not involve repeated measurements."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper discusses existing prototyping approaches in Section 2.5 (Wizard of Oz, LLM prompting, design thinking) but does not perform a head-to-head comparison or systematic baseline evaluation. The criteria-based evaluation in Section 4.4 is self-referential."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No baselines are included in the evaluation, so there are no contemporary baselines against which the framework is compared."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "The framework is a single conceptual artifact without separable components that could be ablated."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The evaluation uses qualitative criteria (A1-A3, B1-B4, C1-C4) but no quantitative metrics. The ML metrics (precision, recall, F1) shown in the case study are from Google AutoML's output, not the paper's evaluation of its own contribution."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The paper claims the framework supports AI non-experts but includes no user study or human evaluation of the framework itself. The case study was conducted by the authors, not independent AI non-experts."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "The paper does not perform quantitative evaluation with training/test splits for its own contribution. The 80/20 split mentioned is internal to Google AutoML's model training in the case study."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4.4 provides a detailed breakdown of evaluation results across integration in prototyping stages (A1-A3), product success criteria (B1-B4), and challenges addressed (C1-C4), with separate discussion for each criterion."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 5.1 discusses specific limitations: NC AutoML cannot handle complex data acquisition, custom model development, or full UI integration. Section 5.3 discusses ethical risks. Section 5.4 lists research limitations including single tool, single dataset, single use case."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports multiple things that did not work well: black-box nature of AutoML models, inability to test holistic AI experience without coding, limited data engineering capabilities, and inability to fully validate human-centricity (Section 5.1)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims are reasonably hedged: 'highlighting its efficacy in supporting AI non-experts and streamlining decision-making and its limitations.' The paper does present both positive and negative findings in Section 4.4 and Section 5."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes causal claims throughout: 'NC AutoML has been proven to be a valuable tool' (Section 5.1), 'NC AutoML can improve the AI product prototyping process' (Section 7). These claims rest on a single illustrative case study with no controlled comparison, inadequate for causal inference."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The conclusion claims 'This innovation is not confined to any specific sector; rather, it permeates all industries' (Section 7) based on one case study with one tool (Google Vertex AI), one use case (text classification), and one dataset. Section 5.4 acknowledges limitations but the framing in Sections 5.2 and 7 far exceeds the evidence."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No alternative explanations are discussed for why the case study demonstrated positive results. For example, the success might be attributable to the simplicity of the text classification task, the quality of the specific dataset, or Google Vertex AI's particular strengths rather than NC AutoML in general."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures whether NC AutoML provides certain features (UI guidance, preview capability, performance metrics) and frames this as 'utility' and 'validity' of the framework for improving the prototyping process. No acknowledgment that the presence of features does not equal actual improvement in prototyping outcomes for non-experts."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper mentions 'Google Vertex AI' and 'Google AutoML' but provides no version information, API version, or date of access beyond 'Accessed: Jan. 01, 2024' in the reference."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting. The NC AutoML approach uses a GUI-based workflow, not prompt-based interaction."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The only parameter mentioned is the 80% train / 20% validation-test split. No other hyperparameters are reported. The paper states 'AutoML required 5 hours to train' but provides no detail on what configurations or algorithms were used internally."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The paper evaluates a GUI-based AutoML tool."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper states 'We used a dataset with 3440 training examples' uploaded as CSV, with data previewed and class balance checked, but provides no detail on how the original data was collected, cleaned, or prepared for upload."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5.4 'Limitations of Research' is a dedicated subsection listing four specific limitations of the study."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 5.4 lists specific threats: the case study used only Google Vertex AI for classification, with only one dataset and one ML functionality type. It also identifies four concrete steps needed for generalizability (multiple AutoML providers, multiple ML use cases, user acceptance testing, and multiple AI use cases)."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 5.4 explicitly states what was NOT tested and what future research should address: multiple AutoML providers, multiple ML use cases, user acceptance testing with actual AI non-experts, and multiple AI use cases."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The customer support dataset of 3,440 examples used in the case study is not made available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The paper states only 'We used a dataset with 3440 training examples' for customer request classification. No description of how the data was collected, from which source, over what time period, or what the data characteristics are beyond the number of examples."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants were recruited for the study. The NC AutoML user in the case study appears to be one of the authors, and the data source is customer support tickets, not a recruited sample."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No documentation of how the 3,440 customer support tickets were collected, filtered, labeled, or prepared before upload to Google AutoML. The paper goes directly from 'we used a dataset' to demonstrating the AutoML workflow."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The paper explicitly states 'No explicit funding' under 'Funder information' and acknowledges employers (Adobe and Siemens) in the acknowledgments section."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly stated: Mario Truss (Adobe) and Marc Schmitt (Siemens). The acknowledgments thank 'their employer for allowing them to do independent research.'"
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The authors' employers (Adobe and Siemens) do not make the specific product being evaluated (Google Vertex AI). Neither company has a direct financial stake in the paper's findings about NC AutoML for prototyping."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "The Declaration of Interest Statement reads: 'The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this article.'"
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. It demonstrates a no-code AutoML tool for prototyping purposes."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper does not evaluate a pre-trained model on any benchmark. The AutoML model trained in the case study uses the authors' own proprietary dataset."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No benchmark evaluation is performed. The paper demonstrates an AutoML workflow, not model capability on a benchmark."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study. The case study was conducted by the authors themselves."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No recruited human participants. The NC AutoML user in the case study appears to be one of the authors rather than a recruited participant."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants and no experimental conditions requiring randomization."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants and no experimental conditions requiring blinding."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No cost information is provided for using Google Vertex AI. The paper mentions '5 hours to train the ML model' but does not report any monetary cost, API costs, or compute resource details."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Only 'AutoML required 5 hours to train the ML model' is mentioned. No GPU/compute specifications, total API spend, or hardware details are provided."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "NC AutoML can be integrated into all three stages of the AI product prototyping process (ideation, prototyping, and testing).",
    296       "evidence": "Criteria-based evaluation in Section 4.4 (A1-A3) describes how Google AutoML supports ideation through documentation, prototyping through automated model training via UI, and testing through the preview function.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "NC AutoML provides insights into all four product success criteria: viability, feasibility, usability, and desirability.",
    301       "evidence": "Criteria B1-B4 in Section 4.4 describe how AutoML metrics, confusion matrices, and preview UIs can inform each criterion. The evidence is based on features observed in the Google Vertex AI case study.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "NC AutoML solves or improves the four identified challenges in AI product prototyping (knowledge gaps, boundary objects, unpredictable AI, development complexity).",
    306       "evidence": "The discussions of criteria C1-C4 in Section 4.4 argue that AutoML UI guidance bridges knowledge gaps (C1), acts as a boundary object (C2), creates transparency about AI capabilities (C3), and reduces development complexity (C4).",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "AutoML can create comparable results to manual ML development.",
    311       "evidence": "Referenced from prior literature in Section 2.5: 'they provide proof that AutoML can create comparable results to manual ML' citing references [39], [40], [42], [46], [79]-[82]. No original evidence provided in this paper.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "NC AutoML can reduce misinvestments in AI product ideas that are not feasible for productive development.",
    316       "evidence": "Section 5.1 argues this based on the ability to evaluate AI product ideas quickly with real data, but the claim rests on the single case study with no measurement of actual investment outcomes.",
    317       "supported": "weak"
    318     }
    319   ],
    320   "red_flags": [
    321     {
    322       "flag": "Self-referential evaluation design",
    323       "detail": "The evaluation criteria (A1-A3, B1-B4, C1-C4) were designed by the same authors who created the framework and conducted the case study. There is no independent validation or external evaluation by actual AI non-experts."
    324     },
    325     {
    326       "flag": "Claims significantly outrun evidence",
    327       "detail": "The conclusion states 'This innovation is not confined to any specific sector; rather, it permeates all industries' based on a single case study with one tool (Google Vertex AI), one dataset (3,440 customer support tickets), and one use case (text classification). The language of 'proven' (Section 5.1) is inappropriate for the evidence level."
    328     },
    329     {
    330       "flag": "No actual user study despite claims about non-experts",
    331       "detail": "The paper's central claim is about supporting AI non-experts, yet the case study was conducted by the authors themselves. The paper states 'The NC AutoML user had no prior knowledge of ML and no coding capabilities' but does not clarify whether this refers to one of the authors or an external participant, and no user acceptance testing was conducted."
    332     },
    333     {
    334       "flag": "Single case study generalization",
    335       "detail": "All framework validation relies on one case study with Google Vertex AI for text classification. While Section 5.4 acknowledges this limitation, the framework is nonetheless presented as validated and generalizable throughout Sections 5 and 7."
    336     },
    337     {
    338       "flag": "Conflict of interest not fully explored",
    339       "detail": "The first author works at Adobe, which offers AI platform products (Adobe Sensei is mentioned in Section 2.3). While they evaluate a Google product, the general finding that NC AutoML is valuable for prototyping serves the interests of all major AI platform providers, including the first author's employer."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "AutoML: A Survey of the State-of-the-Art",
    345       "authors": ["X. He", "K. Zhao", "X. Chu"],
    346       "year": 2021,
    347       "doi": "10.1016/j.knosys.2020.106622",
    348       "relevance": "Comprehensive survey of AutoML techniques and capabilities, relevant to understanding automated ML development tools."
    349     },
    350     {
    351       "title": "PromptMaker: Prompt-based Prototyping with Large Language Models",
    352       "authors": ["E. Jiang"],
    353       "year": 2022,
    354       "doi": "10.1145/3491101.3503564",
    355       "relevance": "Explores using LLMs for AI product prototyping through prompts, directly relevant to LLM-based development tools."
    356     },
    357     {
    358       "title": "PromptChainer: Chaining Large Language Model Prompts through Visual Programming",
    359       "authors": ["T. Wu"],
    360       "year": 2022,
    361       "doi": "10.1145/3491101.3519729",
    362       "relevance": "Introduces LLM chaining for prototyping, relevant to agentic workflow composition and multi-step LLM pipelines."
    363     },
    364     {
    365       "title": "PromptInfuser: Bringing User Interface Mock-ups to Life with Large Language Models",
    366       "authors": ["S. Petridis", "M. Terry", "C. J. Cai"],
    367       "year": 2023,
    368       "doi": "10.1145/3544549.3585628",
    369       "relevance": "Integrates LLM responses into UI prototypes, relevant to AI product prototyping and LLM application development."
    370     },
    371     {
    372       "title": "ProtoAI: Model-Informed Prototyping for AI-Powered Interfaces",
    373       "authors": ["H. Subramonyam", "C. Seifert", "E. Adar"],
    374       "year": 2021,
    375       "doi": "10.1145/3397481.3450640",
    376       "relevance": "Proposes a framework for prototyping AI interfaces with functional models, directly relevant to AI product development."
    377     },
    378     {
    379       "title": "Vulnerabilities in AI Code Generators: Exploring Targeted Data Poisoning Attacks",
    380       "authors": ["D. Cotroneo", "C. Improta", "P. Liguori", "R. Natella"],
    381       "year": 2023,
    382       "doi": "10.48550/arXiv.2308.04451",
    383       "relevance": "Explores security vulnerabilities in AI code generation systems, relevant to AI safety and code quality concerns."
    384     },
    385     {
    386       "title": "Fits and Starts: Enterprise Use of AutoML and the Role of Humans in the Loop",
    387       "authors": ["A. Crisan", "B. Fiore-Gartland"],
    388       "year": 2021,
    389       "relevance": "Studies enterprise adoption of AutoML and human-in-the-loop dynamics, relevant to AI productivity tools and human-AI collaboration."
    390     },
    391     {
    392       "title": "Advances, challenges and opportunities in creating data for trustworthy AI",
    393       "authors": ["W. Liang"],
    394       "year": 2022,
    395       "doi": "10.1038/s42256-022-00516-1",
    396       "relevance": "Addresses data quality challenges for trustworthy AI development, relevant to AI safety and data integrity."
    397     },
    398     {
    399       "title": "Six Human-Centered Artificial Intelligence Grand Challenges",
    400       "authors": ["O. Ozmen Garibay"],
    401       "year": 2023,
    402       "doi": "10.1080/10447318.2022.2153320",
    403       "relevance": "Defines key challenges in human-centered AI, providing context for AI product development research."
    404     },
    405     {
    406       "title": "Democratizing artificial intelligence: How no-code AI can leverage machine learning operations",
    407       "authors": ["L. Sundberg", "J. Holmström"],
    408       "year": 2023,
    409       "doi": "10.1016/j.bushor.2023.04.003",
    410       "relevance": "Directly addresses AI democratization through no-code tools and MLOps, relevant to AI accessibility and tool evaluation."
    411     },
    412     {
    413       "title": "Accelerating Innovation With Generative AI: AI-Augmented Digital Prototyping and Innovation Methods",
    414       "authors": ["V. Bilgram", "F. Laarmann"],
    415       "year": 2023,
    416       "doi": "10.1109/EMR.2023.3272799",
    417       "relevance": "Explores generative AI for digital prototyping, relevant to LLM-based development tools and AI-augmented workflows."
    418     }
    419   ],
    420   "engagement_factors": {
    421     "practical_relevance": {
    422       "score": 2,
    423       "justification": "The framework provides a conceptual guide for product managers wanting to use NC AutoML for prototyping, but no immediately usable tool or technique."
    424     },
    425     "surprise_contrarian": {
    426       "score": 0,
    427       "justification": "The finding that no-code tools can help non-experts prototype AI products confirms widely-held expectations about low-code/no-code democratization."
    428     },
    429     "fear_safety": {
    430       "score": 0,
    431       "justification": "The paper briefly mentions ethical risks and black-box concerns but raises no novel safety or security concerns."
    432     },
    433     "drama_conflict": {
    434       "score": 0,
    435       "justification": "No controversy, no challenges to existing claims or tools."
    436     },
    437     "demo_ability": {
    438       "score": 1,
    439       "justification": "Google Vertex AI is publicly available and someone could follow along, but the paper provides no code, demo, or reproducible artifact."
    440     },
    441     "brand_recognition": {
    442       "score": 1,
    443       "justification": "Authors from Adobe and Siemens, uses Google AutoML — known brands but not the most prominent AI labs."
    444     }
    445   }
    446 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs