scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (21230B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Generative AI for Software Architecture. Applications, Trends, Challenges, and Future Directions",
      6     "authors": [
      7       "Matteo Esposito",
      8       "Xiaozhou Li",
      9       "Sergio Moreschini",
     10       "Noman Ahmad",
     11       "Tomáš Černý"
     12     ],
     13     "year": 2025,
     14     "venue": "Journal of Systems and Software",
     15     "arxiv_id": "2503.13310",
     16     "doi": "10.48550/arXiv.2503.13310"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims about architectural decision support dominance (38%, Table 11), GPT model prevalence (62%, Table 12), few-shot prompting and RAG usage (Table 13), initial SALC stages focus (Table 14), monolithic/microservice targets (Table 15), and missing rigorous testing (93%, Table 18) are all supported by the results tables.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "The paper makes descriptive claims about the state of research ('GenAI has been applied mostly to...', 'OpenAI GPT models are predominantly applied'). No causal claims about what causes or improves architectural outcomes are made.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title 'Generative AI for Software Architecture. Applications, Trends, Challenges, and Future Directions' and conclusions like 'GenAI shows significant potential in software design' are broad, but the evidence comes from only 46 papers (36 peer-reviewed, 10 gray literature) searched through February 2025. The paper does not explicitly acknowledge that 46 papers may not represent the full landscape, especially given the rapidly evolving field.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss alternative explanations for its findings. For example, the dominance of OpenAI GPT (62%) could reflect publication bias, marketing, or API accessibility rather than actual superiority or preference. The paper presents percentages at face value without considering confounds in what gets published.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper measures what is reported in published literature and frames it as how GenAI 'is utilized' in software architecture. The gap between publication patterns and actual industry practice is not explicitly discussed. The inclusion of gray literature partially addresses this but the proxy gap is not acknowledged.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 6 'Threats to Validity' provides a dedicated discussion structured by construct, internal, external, and conclusion validity, following Wohlin et al.'s framework.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 6 discusses specific threats: subjective analysis mitigated by dual-author extraction with third-author arbitration, potential non-inclusion of studies mitigated by searching eight digital libraries plus snowballing, inability to evaluate external validity of all included studies, and specific mention of applying inclusion/exclusion criteria to both title/abstract and full text.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what the results do NOT show. The search is bounded temporally (after March 2022) and by topic (GenAI in software architecture), but the paper makes no explicit statements about what claims it is NOT making or what populations/settings are excluded from its conclusions.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The Acknowledgment section lists: 'Business Finland Project 6GSoft, Academy of Finland project MUFANO/349488, and National Science Foundation (NSF) Grant No. 2409933.'",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: University of Oulu (Finland), Tampere University (Finland), University of Arizona (USA), and IIIT Hyderabad (India). None of the authors are affiliated with GenAI product companies.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Funders are public research agencies (Business Finland, Academy of Finland, NSF) with no financial interest in any particular GenAI product or outcome.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests declaration statement is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "GenAI is defined in the introduction, the Software Architecture Life Cycle (SALC) phases are explicitly enumerated and operationalized in Table 14, and the MLR methodology is defined by citation to Garousi et al.; key terms are grounded in the data extraction framework (Table 6).",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The introduction explicitly enumerates five bullet-point contributions: a comprehensive synthesis, a GenAI model classification, identification of common applications and challenges, identification of research gaps, and bridging industry-research via gray literature.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 includes Table 1 comparing this work against nine prior systematic studies, explicitly stating how this MLR differs by focusing specifically on GenAI for software architecture and incorporating gray literature that prior SLRs omitted.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "survey": {
    120       "search_and_selection": {
    121         "search_strategy_reproducible": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The complete search string is provided verbatim in Section 3.2.1 with all Boolean operators, wildcards, and term variants; databases are listed; a replication package with raw data is hosted on Zenodo (doi:10.5281/zenodo.15032395).",
    125           "source": "haiku"
    126         },
    127         "inclusion_exclusion_explicit": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Table 2 lists one inclusion criterion (I1) and seven exclusion criteria (E1-E7) with the screening step at which each applies (title/abstract, full text, or both), consistently applied across both white and gray literature.",
    131           "source": "haiku"
    132         },
    133         "prisma_or_structured_protocol": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "The paper follows the MLR guidelines of Garousi et al. [5] and Kitchenham and Charters [20], with a documented multi-stage workflow illustrated in Figure 1 covering all standard systematic review phases.",
    137           "source": "haiku"
    138         },
    139         "search_terms_provided": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "The full search string is displayed in a formatted box in Section 3.2.1, including all GenAI synonyms (gen AI, gen-AI, genAI, LLM, GPT*, Claude*, etc.) and software architecture variants with wildcards.",
    143           "source": "haiku"
    144         },
    145         "databases_listed": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "White literature: ACM Digital Library, IEEEXplore, Scopus, Web of Science; gray literature: Google, Google Scholar, Bing — all explicitly named in Section 3.2.2.",
    149           "source": "haiku"
    150         },
    151         "screening_process_documented": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Table 5 provides counts at each stage for both white (621→45→27+11→36, -2 quality) and gray (433→77→5+3→10) literature, with inter-rater Cohen's kappa (κ) coefficients reported at each stage (κ=0.71, 0.64 for white; κ=0.81, 0.88 for gray).",
    155           "source": "haiku"
    156         },
    157         "review_scope_justified": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "The time boundary starting March 2022 is explicitly justified by the GPT-3.5 public release; database choices are justified by Kitchenham's recommendations; the MLR approach including gray literature is justified by the rapid industry adoption outpacing academic publication.",
    161           "source": "haiku"
    162         }
    163       },
    164       "synthesis_quality": {
    165         "conflicting_findings_acknowledged": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Section 5.4 explicitly contrasts white and gray literature findings (white: formal empirical rigor, academic depth; gray: anecdotal evidence, productivity focus, less critical stance), and the introduction acknowledges 'inconsistent findings regarding the reliability of GenAI for architectural decisions.'",
    169           "source": "haiku"
    170         },
    171         "quality_assessment_of_sources": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Quality assessment is applied to white literature via an 11-question Likert checklist (Table 3, from Dyba and Dingsøyr) and to gray literature via a multi-criteria rubric (Table 4, from Garousi et al.); two white papers were excluded for failing the quality threshold.",
    175           "source": "haiku"
    176         },
    177         "publication_bias_discussed": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "The paper never discusses publication bias; it does not acknowledge that published papers about GenAI for software architecture may skew positive or that negative/null results are underrepresented in the reviewed corpus.",
    181           "source": "haiku"
    182         },
    183         "quantitative_synthesis_present": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "The paper systematically reports frequency counts and percentages across all findings in multiple structured tables (e.g., 62% GPT use, 38% architectural decision support, 93% lacking validation output); quantitative vote-counting synthesis is pervasive throughout the results section.",
    187           "source": "haiku"
    188         },
    189         "recommendations_supported_by_evidence": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Section 5 derives implications from specific identified findings with citations to primary studies — e.g., validation frameworks are recommended because 93% of studies lack them; hallucination handling is recommended because five specific studies flag it as a critical issue.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "OpenAI GPT models dominate GenAI for software architecture research, used in 62% of studies",
    201       "evidence": "Table 12 counts 105 model references to OpenAI variants across 39 papers; confirmed in RQ1.2 summary stating 'OpenAI GPT models are the ones that rule the roost'",
    202       "supported": "strong"
    203     },
    204     {
    205       "claim": "Architectural decision support is the most frequently investigated purpose, appearing in 38% of reviewed papers",
    206       "evidence": "Table 11 maps 18 papers to 'Architectural Decision Support' purpose; confirmed in RQ1.1 summary",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "93% of studies do not report any evaluation strategy for AI-generated architectural outputs",
    211       "evidence": "Table 18 shows 43 of 46 papers marked 'Unspecified' for validation method; only ATAM (1 paper), SAAM (1 paper), and static analysis (1 paper) are reported",
    212       "supported": "strong"
    213     },
    214     {
    215       "claim": "Requirements-to-Architecture is the most targeted SALC phase, appearing in 40% of papers",
    216       "evidence": "Table 14 shows 24 papers in Req-to-Arch category; stated explicitly in RQ2.1 summary",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "Few-shot prompting is the most common technique, present in 31% of studies",
    221       "evidence": "Table 13 shows 16 papers using few-shot prompting; confirmed in RQ1.3 summary",
    222       "supported": "strong"
    223     },
    224     {
    225       "claim": "85% of studies involve some form of human interaction with the model",
    226       "evidence": "Table 13 shows 39 of 46 papers report human-model interaction; stated explicitly in RQ1.3 summary",
    227       "supported": "strong"
    228     },
    229     {
    230       "claim": "GenAI shows significant potential in software architecture but rigorous evaluation is largely absent",
    231       "evidence": "Supported by the 93% validation gap finding; however the positive potential claim is an editorial judgment rather than an empirically measured outcome, making it partially supported",
    232       "supported": "moderate"
    233     }
    234   ],
    235   "methodology_tags": [
    236     "qualitative",
    237     "meta-analysis"
    238   ],
    239   "key_findings": "This MLR of 46 papers (36 academic, 10 gray) finds that GenAI for software architecture is dominated by OpenAI GPT models (62%) and primarily targets architectural decision support (38%) and Requirements-to-Architecture transitions (40%). Despite active adoption, 93% of studies report no formal evaluation strategy for AI-generated architectural outputs, revealing a critical absence of rigorous testing. Key open challenges include LLM accuracy, hallucinations, ethics and privacy concerns, and the lack of architecture-specific datasets and standardized evaluation frameworks needed for systematic progress in the field.",
    240   "red_flags": [
    241     {
    242       "flag": "Small corpus, broad conclusions",
    243       "detail": "Only 46 papers (36 white, 10 gray) from 1054 retrieved (~4.4% inclusion rate) is a thin basis for broad conclusions about the 'state of the field' in a rapidly evolving domain."
    244     },
    245     {
    246       "flag": "Optimistic conclusion unsupported by evidence",
    247       "detail": "The conclusion states 'It is optimistic that LLM...can continue to contribute to this domain with even more astonishing outcomes' — this goes well beyond what a descriptive frequency survey can support and contradicts the paper's own finding that rigorous evaluation is largely absent."
    248     },
    249     {
    250       "flag": "Publication bias not discussed",
    251       "detail": "The paper does not acknowledge that published papers about GenAI likely skew positive, potentially inflating apparent adoption rates and success narratives in the reviewed corpus."
    252     },
    253     {
    254       "flag": "Gray literature heterogeneity",
    255       "detail": "Gray literature includes YouTube videos, blog posts, and press releases alongside more credible industry reports; mixing these sources with peer-reviewed work may distort the overall picture despite quality criteria being applied."
    256     },
    257     {
    258       "flag": "Boilerplate threats section",
    259       "detail": "The threats to validity section is largely formulaic, with internal and external validity subsections providing no specific quantified risks or concrete examples of what might be wrong."
    260     }
    261   ],
    262   "cited_papers": [
    263     {
    264       "title": "Large language models for software engineering: Survey and open problems (Fan et al., ICSE-FoSE 2023)",
    265       "relevance": "Foundational survey establishing that LLMs in SE are concentrated on code generation with limited coverage of architecture/design — directly motivates this MLR"
    266     },
    267     {
    268       "title": "Large language models for software engineering: A systematic literature review (Hou et al., ACM TOSEM 2024)",
    269       "relevance": "Prior SLR of 395 papers finding only 4 relevant to software design, quantifying the gap this MLR addresses"
    270     },
    271     {
    272       "title": "A survey on large language models for code generation (Jiang et al., arXiv 2024)",
    273       "relevance": "Related SLR of 235 papers on LLMs for code generation, contextualizing architecture-to-code as a subset"
    274     },
    275     {
    276       "title": "Software testing with large language models: Survey, landscape, and vision (Wang et al., IEEE TSE 2024)",
    277       "relevance": "Companion SLR on LLMs for testing, part of the landscape showing architecture is underserved"
    278     },
    279     {
    280       "title": "Artificial intelligence for software architecture: Literature review and the road ahead (Bucaioni et al., 2025)",
    281       "relevance": "Direct parallel work on AI for software architecture published concurrently, compared in Table 1"
    282     },
    283     {
    284       "title": "Software architecture meets LLMs: A systematic literature review (Schmid et al., 2025)",
    285       "relevance": "Very closely related SLR on LLMs for software architecture (18 papers), identified as prior work this MLR extends with broader scope and gray literature"
    286     },
    287     {
    288       "title": "Guidelines for including grey literature and conducting multivocal literature reviews in software engineering (Garousi et al., IST 2019)",
    289       "relevance": "Methodological foundation for the MLR approach, quality assessment criteria for gray literature (Table 4)"
    290     },
    291     {
    292       "title": "Guidelines for performing systematic literature reviews in software engineering (Kitchenham & Charters, 2007)",
    293       "relevance": "Core methodology guidelines for database selection, search strategy, and screening process used throughout"
    294     }
    295   ],
    296   "engagement_factors": {
    297     "practical_relevance": {
    298       "score": 1,
    299       "justification": "Provides a taxonomy and research roadmap but no immediately usable tools or techniques for practitioners."
    300     },
    301     "surprise_contrarian": {
    302       "score": 1,
    303       "justification": "The finding that 93% of studies lack GenAI output validation is noteworthy but the overall message — that GenAI in architecture is immature — largely confirms expectations."
    304     },
    305     "fear_safety": {
    306       "score": 0,
    307       "justification": "No novel AI risk or security concerns are raised; challenges mentioned (hallucinations, accuracy) are well-known."
    308     },
    309     "drama_conflict": {
    310       "score": 0,
    311       "justification": "No controversy or provocative claims; the paper is a neutral synthesis of existing literature."
    312     },
    313     "demo_ability": {
    314       "score": 0,
    315       "justification": "No code, tool, or demo is provided — only a Zenodo data archive."
    316     },
    317     "brand_recognition": {
    318       "score": 1,
    319       "justification": "Authors from University of Oulu and University of Arizona; not famous AI labs but established SE research groups. Published in Journal of Systems and Software."
    320     }
    321   },
    322   "hn_data": {
    323     "threads": [
    324       {
    325         "hn_id": "30898271",
    326         "title": "Visualizing quantum mechanics in an interactive simulation",
    327         "points": 3,
    328         "comments": 0,
    329         "url": "https://news.ycombinator.com/item?id=30898271"
    330       },
    331       {
    332         "hn_id": "22752977",
    333         "title": "Agent57: Outperforming the Atari Human Benchmark",
    334         "points": 3,
    335         "comments": 0,
    336         "url": "https://news.ycombinator.com/item?id=22752977"
    337       },
    338       {
    339         "hn_id": "43516923",
    340         "title": "UniHOPE: A Unified Approach for Hand-Only and Hand-Object Pose Estimation",
    341         "points": 2,
    342         "comments": 0,
    343         "url": "https://news.ycombinator.com/item?id=43516923"
    344       },
    345       {
    346         "hn_id": "43496516",
    347         "title": "UniHOPE: A Unified Approach for Hand-Only and Hand-Object Pose Estimation",
    348         "points": 2,
    349         "comments": 0,
    350         "url": "https://news.ycombinator.com/item?id=43496516"
    351       },
    352       {
    353         "hn_id": "43759600",
    354         "title": "In between myth and reality: AI for math – a case study in category theory",
    355         "points": 1,
    356         "comments": 0,
    357         "url": "https://news.ycombinator.com/item?id=43759600"
    358       },
    359       {
    360         "hn_id": "30835084",
    361         "title": "Visualizing quantum mechanics in an interactive simulation",
    362         "points": 1,
    363         "comments": 0,
    364         "url": "https://news.ycombinator.com/item?id=30835084"
    365       },
    366       {
    367         "hn_id": "23958170",
    368         "title": "Agent57: Outperforming the Atari Human Benchmark",
    369         "points": 1,
    370         "comments": 0,
    371         "url": "https://news.ycombinator.com/item?id=23958170"
    372       }
    373     ],
    374     "top_points": 3,
    375     "total_points": 13,
    376     "total_comments": 0
    377   }
    378 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs