scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (17755B)
      1 {
      2   "paper": {
      3     "title": "Continuous Software Engineering Practices in AI/ML Development Past the Narrow Lens of MLOps: Adoption Challenges",
      4     "authors": ["Sini Vänskä", "Kai-Kristian Kemell", "Tommi Mikkonen", "Pekka Abrahamsson"],
      5     "year": 2024,
      6     "venue": "e-Informatica Software Engineering Journal",
      7     "doi": "10.37190/e-Inf240102"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No code or analysis scripts are released. No repository URL mentioned."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "Interview data is not released. No data download link or supplementary materials provided."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a qualitative interview study with no computational environment to specify."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The interview instrument is provided in Appendix A, but no step-by-step instructions for reproducing the analysis are given."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "Qualitative interview study with no quantitative results requiring confidence intervals."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "Qualitative study; no statistical comparisons are made."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "Qualitative study; no effect sizes applicable."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The authors justify N=8 by citing the exploratory nature of the study and referencing Eisenhardt's recommendation for case study research on novel topics (Section 6.2)."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "Qualitative study with no quantitative variance to report."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "Qualitative exploratory study; no baselines applicable."
     64       },
     65       "baselines_contemporary": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No baselines applicable for this qualitative study."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No system or method to ablate; this is a qualitative interview study."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No quantitative metrics; qualitative thematic analysis."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No system outputs to evaluate; the study itself is interview-based qualitative research."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No test sets; qualitative study."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by the 13 continuous SE practices and organized into 5 themes (tools/methods, business strategy, development, operations, improvement/innovation). Table 2 shows code occurrences per practice."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper extensively discusses challenges and failures in adopting continuous practices, including practices that were entirely absent (continuous compliance, trust, security — PEC1)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Multiple negative findings reported: continuous compliance/trust/security absent from data (PEC1), SE methods have little impact on ML development (PEC2), ML experts work in silos (PEC3)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims challenges were identified in adopting continuous SE practices, which is supported by the 7 PECs in the results."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper makes observational claims about challenges and does not assert causal relationships. Language like 'results in' and 'makes it difficult' describes observed associations from interviews, not tested causal mechanisms."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 6.2 explicitly acknowledges that N=8 limits generalizability, that convenience sampling is a limitation, and that 3 of 8 organizations were research projects rather than purely industrial contexts."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The discussion section considers alternative factors: COVID-19 remote work affecting collaboration (Section 5.2), project type (external vs internal customers) affecting operations emphasis (PEC6), and domain-specific regulatory factors (PEC1 discussion)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "No AI models used in this study; it is a qualitative interview study."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No AI prompting used."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No AI models or computational experiments."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.2 describes the thematic analysis approach: interviews were transcribed, coded using deductive codes based on the research framework, and analyzed to identify challenges. The coding process and themes are documented."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6.2 is a dedicated Limitations subsection with substantive discussion."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 6.2 discusses specific threats: respondents held different roles influencing answers, only one respondent per organization limits understanding, 3 of 8 organizations were research projects rather than purely industrial contexts, and respondents were selected by convenience sampling."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6.2 explicitly states what was not tested: they did not focus on specific technologies or project contexts, did not study organizations that specifically claim to use MLOps, and acknowledge their findings are 'less specific' as a result."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Interview transcripts are not made available for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4.1 describes data collection: thematic semi-structured interviews, conducted digitally during COVID-19, in Finnish or English, with the interview instrument provided in Appendix A."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 4.1 states convenience sampling was used. Section 6.2 acknowledges this as a limitation. Table 1 lists respondent roles."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Sections 4.1-4.2 describe the pipeline: interviews conducted → transcribed → coded using deductive codes from the framework → themes identified → PECs formulated. Table 2 shows code occurrences."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgments section states: 'This work was partly funded by local authorities (\"Business Finland\") under grant agreement ITEA-2020-20219-IML4E of the ITEA4 programme.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Deloitte Finland, University of Helsinki, University of Jyväskylä, Tampere University."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Business Finland is a public funding agency; the ITEA4 programme is a European research programme. Neither has a financial stake in the study's findings about continuous SE adoption challenges."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper. One author is affiliated with Deloitte, a consulting firm, but no conflict-of-interest declaration addresses this."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "No pre-trained model evaluated on any benchmark."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No pre-trained model evaluated on any benchmark."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No pre-trained model evaluated on any benchmark."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No pre-registration mentioned. The study involves human participants (interviewees)."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No IRB or ethics board approval is mentioned despite conducting interviews with human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Table 1 lists job titles for all 8 respondents. Section 4.1 notes experience ranged from months to decades, and that respondents came from different organizations."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "Beyond 'respondents from different organizations working on AI-related projects' selected via convenience sampling, no explicit inclusion/exclusion criteria are stated."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "Not an experimental study with conditions; this is a qualitative interview study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "Not an experimental study; blinding is not applicable to qualitative interviews."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No information on whether any invited participants declined or dropped out. Only the final N=8 is reported."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "Qualitative interview study; no computational method with inference costs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "Qualitative interview study; no computational budget applicable."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Continuous compliance, continuous trust, and continuous security were not present in the interview data from ML development organizations.",
    286       "evidence": "Table 2 shows zero code occurrences for these three practices across all 8 respondents (PEC1, Section 5).",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "SE methods used by the broader organization have little impact on ML development processes.",
    291       "evidence": "Multiple respondents described not following any SE method strictly; agile adoption was limited to sprints with no deeper integration (PEC2, Section 5.1).",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "ML experts often work in a silo, not participating in business strategy activities.",
    296       "evidence": "4 of 8 respondents said ML experts worked independently; 3 felt they did not understand the point of their project; most had little customer interaction (PEC3, Section 5.2).",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Automated testing in ML development remains a challenge, with only 1 of 8 respondents using automated testing via an MLOps pipeline.",
    301       "evidence": "Section 5.3 reports only Respondent 7 discussed automated testing; others relied on manual testing (PEC4).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "The lack of user and customer interaction makes it difficult for ML experts to ensure continuous use of the product.",
    306       "evidence": "Section 5.4 reports ML experts seldom had direct contact with users; feedback only reached them if something was 'great or terribly wrong' (PEC5).",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["qualitative", "case-study"],
    311   "key_findings": "Through interviews with 8 ML experts from different organizations, the study found that continuous SE practices beyond DevOps/MLOps are poorly adopted in ML development. Key challenges include ML experts working in silos with limited collaboration, SE methods having little impact on ML development processes, automated testing remaining difficult, and lack of user/customer interaction hindering continuous operations. Three continuous practices (compliance, trust, security) were entirely absent from the data.",
    312   "red_flags": [
    313     {
    314       "flag": "Very small sample size",
    315       "detail": "Only 8 respondents from 8 organizations, selected via convenience sampling. Three organizations were research projects rather than purely industrial contexts. The authors acknowledge this limitation but still draw broad conclusions about ML development challenges."
    316     },
    317     {
    318       "flag": "No ethics approval mentioned",
    319       "detail": "The study involves human participants (interviewees) but does not mention IRB or ethics board approval, which is standard practice for interview studies."
    320     }
    321   ],
    322   "cited_papers": [
    323     {
    324       "title": "Software engineering for AI-Based systems: A survey",
    325       "authors": ["S. Martínez-Fernández", "J. Bogner", "X. Franch", "M. Oriol", "J. Siebert"],
    326       "year": 2022,
    327       "relevance": "Comprehensive survey on SE challenges for AI-based systems, directly relevant to understanding engineering practices in AI/ML development."
    328     },
    329     {
    330       "title": "A software engineering perspective on engineering machine learning systems: State of the art and challenges",
    331       "authors": ["G. Giray"],
    332       "year": 2021,
    333       "relevance": "Surveys SE challenges in ML system development, a core topic of the survey scope."
    334     },
    335     {
    336       "title": "Adoption and effects of software engineering best practices in machine learning",
    337       "authors": ["A. Serban", "K. van der Blom", "H. Hoos", "J. Visser"],
    338       "year": 2020,
    339       "relevance": "Empirical study on SE best practice adoption rates in ML development teams."
    340     },
    341     {
    342       "title": "Collaboration challenges in building ML-enabled systems: Communication, documentation, engineering, and process",
    343       "authors": ["N. Nahar", "S. Zhou", "G. Lewis", "C. Kästner"],
    344       "year": 2022,
    345       "relevance": "Studies collaboration challenges between ML and SE teams, directly relevant to agentic AI development practices."
    346     },
    347     {
    348       "title": "Data scientists in software teams: State of the art and challenges",
    349       "authors": ["M. Kim", "T. Zimmermann", "R. DeLine", "A. Begel"],
    350       "year": 2017,
    351       "relevance": "Foundational study on integrating data science/ML roles into software teams."
    352     },
    353     {
    354       "title": "Towards MLOps: A framework and maturity model",
    355       "authors": ["M.M. John", "H.H. Olsson", "J. Bosch"],
    356       "year": 2021,
    357       "relevance": "Proposes MLOps framework and maturity model relevant to AI/ML software engineering practices."
    358     },
    359     {
    360       "title": "Understanding development process of machine learning systems: Challenges and solutions",
    361       "authors": ["E. de Souza Nascimento", "I. Ahmed", "E. Oliveira", "M.P. Palheta", "I. Steinmacher"],
    362       "year": 2019,
    363       "relevance": "Studies ML development process challenges, relevant to understanding SE practices in AI development."
    364     }
    365   ]
    366 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs