scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19890B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Generative AI for Software Architecture. Applications, Trends, Challenges, and Future Directions",
      6     "authors": [
      7       "Matteo Esposito",
      8       "Xiaozhou Li",
      9       "Sergio Moreschini",
     10       "Noman Ahmad",
     11       "Tomás Cerný"
     12     ],
     13     "year": 2025,
     14     "venue": "Journal of Systems and Software",
     15     "arxiv_id": "2503.13310",
     16     "doi": "10.48550/arXiv.2503.13310"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims map directly to findings sections: 38% architectural decision support (RQ1.1), 62% GPT dominance (RQ1.2), Req-to-Arch as top SALC phase (RQ2.1), and 93% lacking validation (Table 18).",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "This is a descriptive MLR cataloging what exists in the literature; no causal claims are made.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Results sections consistently report frequency counts over the 46 reviewed papers ('38% of the works...') rather than universal claims; the search cutoff (February 2025) and source scope are explicit.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not consider alternative explanations for key patterns (e.g., GPT dominance could reflect availability bias or search engine prevalence rather than researcher preference); white vs. gray differences are noted but alternative interpretations of findings are absent.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly distinguishes frequency of adoption from effectiveness: it notes that '93% of studies do not report any evaluation strategy, indicating a lack of systematic validation' and refrains from treating usage frequency as evidence of utility.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 6 'Threats to Validity' covers construct, internal, external, and conclusion validity in dedicated subsections.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific inter-rater agreement coefficients are reported (k=71% white T/A, k=64% white full text, k=81% gray T/A, k=88% gray full text); mitigation strategies (third-author arbitration, quality assessment checklists) are described concretely.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The temporal scope (post March 2022) and database scope are explicit, but the paper does not state what the results do NOT demonstrate—e.g., no explicit disclaimer that adoption frequency cannot be interpreted as evidence of GenAI effectiveness in SA.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgment section discloses Business Finland Project 6GSoft, Academy of Finland project MUFANO/349488, and NSF Grant No. 2409933.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The authors' affiliations are listed on the title page: University of Oulu, Tampere University, University of Arizona, and IIIT Hyderabad.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Business Finland, Academy of Finland, and NSF are government funding bodies with no commercial stake in GenAI tool adoption outcomes.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "There is no 'competing interests' or 'declaration of financial interests' statement anywhere in the paper; only funding acknowledgment is present.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Generative AI, MLR (with reference to Garousi et al.'s guidelines), SALC (Software Architecture Life Cycle), and white vs. gray literature distinctions are all defined in context.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Introduction lists five explicit bullet-point contributions: synthesis, model classification, identification of applications/challenges, research gaps, and industry relevance via gray literature.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Table 1 systematically compares nine prior systematic studies, explicitly showing how this MLR differs (focus on SA specifically, MLR vs. SLR, inclusion of gray literature); Section 2 discusses each prior work's findings and gaps.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "survey": {
    120       "search_and_selection": {
    121         "search_strategy_reproducible": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The full search string is provided verbatim with Boolean operators; databases and search engines are named; temporal scope (until February 2025) and platforms (ACM, IEEEXplore, Scopus, Web of Science, Google, Google Scholar, Bing) are explicit.",
    125           "source": "haiku"
    126         },
    127         "inclusion_exclusion_explicit": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Table 2 lists eight criteria (one inclusion criterion, I1, and seven exclusion criteria, E1–E7) with the stage of application (T/A, F, or All) and rationale for each; a test phase using 10 random papers preceded full application.",
    131           "source": "haiku"
    132         },
    133         "prisma_or_structured_protocol": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "The study follows the MLR protocol from Garousi et al. [5] 'Guidelines for including grey literature and conducting multivocal literature reviews in software engineering,' which is a recognized structured protocol for this study type.",
    137           "source": "haiku"
    138         },
    139         "search_terms_provided": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "The complete search string with all OR/AND terms and wildcards is shown in a dedicated box in Section 3.2.1.",
    143           "source": "haiku"
    144         },
    145         "databases_listed": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "White literature: ACM Digital Library, IEEEXplore, Scopus, Web of Science. Gray literature: Google, Google Scholar, Bing. All listed explicitly in Section 3.2.2.",
    149           "source": "haiku"
    150         },
    151         "screening_process_documented": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Table 5 shows counts at each screening stage for both white (621 → 45 → 27 → 38 → 36) and gray (433 → 77 → 5 → 8 → 10) literature streams with reasons for exclusions.",
    155           "source": "haiku"
    156         },
    157         "review_scope_justified": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "The introduction provides six specific justifications for why this review is needed now, including the temporal scope anchored to GPT-3.5's public release (March 2022) as the start of the relevant era.",
    161           "source": "haiku"
    162         }
    163       },
    164       "synthesis_quality": {
    165         "conflicting_findings_acknowledged": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Section 5.4 compares white vs. gray literature perspectives but does not acknowledge conflicting empirical findings within the primary studies themselves (e.g., whether RAG is helpful vs. not across studies).",
    169           "source": "haiku"
    170         },
    171         "quality_assessment_of_sources": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Table 3 provides an 11-question quality checklist for white literature (5-point Likert scale, threshold ≥2) and Table 4 provides a separate multi-criteria quality rubric for gray literature; 2 white and some gray papers were excluded based on quality.",
    175           "source": "haiku"
    176         },
    177         "publication_bias_discussed": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "There is no discussion of whether the literature skews positive about GenAI in SA due to publication bias; given this is a rapidly hyped field, this is a significant omission.",
    181           "source": "haiku"
    182         },
    183         "quantitative_synthesis_present": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "The paper provides systematic frequency counts and percentages across all RQs (e.g., 38% architectural decision support, 62% GPT, 40% Req-to-Arch, 93% no validation), constituting structured vote counting across 46 papers.",
    187           "source": "haiku"
    188         },
    189         "recommendations_supported_by_evidence": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Section 5 implications are explicitly tied to challenges extracted from reviewed papers (e.g., the call for standardized evaluation metrics follows directly from 93% of studies lacking validation); recommendations do not go significantly beyond reviewed evidence.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "OpenAI GPT models dominate the GenAI-in-SA research landscape at 62% of all model usage instances.",
    201       "evidence": "Table 12 shows GPT family variants account for 105 model-usage instances across the 46 papers (counting multi-model papers), a 62% family share.",
    202       "supported": "strong"
    203     },
    204     {
    205       "claim": "Architectural decision support is the most frequent purpose for using GenAI in software architecture (38% of papers).",
    206       "evidence": "Table 11 shows 18 of 46 papers target architectural decision support; second place is reverse engineering at 19%.",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "Few-shot prompting is the most common prompt engineering technique (31% of papers).",
    211       "evidence": "Table 13 reports 16 papers use few-shot prompting; next is zero-shot at 12%.",
    212       "supported": "strong"
    213     },
    214     {
    215       "claim": "93% of reviewed studies do not report any strategy for validating AI-generated architectural outputs.",
    216       "evidence": "Table 18 shows 43 of 46 studies listed as 'Unspecified' for architecture analysis/validation method; only ATAM, SAAM, and static analysis appear once each.",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "85% of studies involve some form of human interaction with GenAI models, indicating assistive rather than autonomous use.",
    221       "evidence": "Table 13 shows 39 of 46 papers report human interaction; only 7 (15%) operate without direct human intervention.",
    222       "supported": "strong"
    223     },
    224     {
    225       "claim": "GenAI is predominantly applied to the early phases of the SALC: Requirements-to-Architecture (40%) and Architecture-to-Code (32%).",
    226       "evidence": "Table 14 documents 24 papers for Req-to-Arch and 19 for Arch-to-Code; Architecture-to-Architecture is only 3% (2 papers).",
    227       "supported": "strong"
    228     },
    229     {
    230       "claim": "68% of reviewed papers do not specify the architectural style or pattern targeted.",
    231       "evidence": "Table 15 shows 32 of 46 studies as 'Unspecified' for architectural style; monolithic leads among specified styles at 15%.",
    232       "supported": "strong"
    233     }
    234   ],
    235   "methodology_tags": [
    236     "meta-analysis",
    237     "qualitative"
    238   ],
    239   "key_findings": "This MLR of 46 papers (36 peer-reviewed, 10 gray literature) finds that GenAI in software architecture is dominated by OpenAI GPT models (62%), primarily applied to architectural decision support (38%) and reverse engineering (19%), with few-shot prompting as the leading technique. The field is concentrated in early SALC phases—Requirements-to-Architecture (40%) and Architecture-to-Code (32%)—and 93% of primary studies provide no validation methodology for AI-generated outputs, indicating a critical absence of rigorous evaluation. Primary challenges identified across the corpus are LLM accuracy (15%), privacy (7%), hallucinations (8%), and ethical concerns (7%), with a clear gap in architecture-specific datasets and standardized benchmarks.",
    240   "red_flags": [
    241     {
    242       "flag": "Small primary corpus",
    243       "detail": "Only 46 papers (36 white, 10 gray) are synthesized to characterize the 'landscape' of a rapidly growing field; this is a thin basis for trend claims."
    244     },
    245     {
    246       "flag": "Publication bias not discussed",
    247       "detail": "No mention of the well-documented bias toward positive GenAI results in published literature; the entire synthesis could be skewed by papers not reporting negative findings."
    248     },
    249     {
    250       "flag": "Primary studies themselves lack rigor",
    251       "detail": "93% of primary studies do not validate their GenAI outputs; synthesizing these studies produces findings about what researchers do, not whether GenAI works—a meta-level limitation not foregrounded."
    252     },
    253     {
    254       "flag": "Gray literature quality heterogeneity",
    255       "detail": "Gray literature includes YouTube videos (GL[3]) and blog posts (GL[2], GL[6], GL[7], GL[9]) alongside industry reports; despite quality screening, these sources introduce high anecdotal content."
    256     },
    257     {
    258       "flag": "AI used in writing without bias analysis",
    259       "detail": "Authors disclose ChatGPT was used to improve language and readability but do not discuss whether this introduces systematic framing bias in a paper about AI capabilities."
    260     },
    261     {
    262       "flag": "No competing interests statement",
    263       "detail": "Multiple authors are from institutions with active GenAI research programs and one author is from the Software Engineering Research Center (IIIT Hyderabad); no competing interests declaration is provided."
    264     }
    265   ],
    266   "cited_papers": [
    267     {
    268       "title": "Large language models for software engineering: Survey and open problems",
    269       "relevance": "Prior survey of LLMs across the full SE lifecycle; this paper explicitly situates itself as filling the gap in SA coverage identified by Fan et al."
    270     },
    271     {
    272       "title": "Large language models for software engineering: A systematic literature review",
    273       "relevance": "SLR of 395 papers on LLMs in SE; found only 4 relevant to software design, motivating the need for a dedicated MLR on SA."
    274     },
    275     {
    276       "title": "Guidelines for including grey literature and conducting multivocal literature reviews in software engineering",
    277       "relevance": "Methodological foundation for the MLR protocol; the paper's gray literature quality criteria (Table 4) are directly derived from Garousi et al."
    278     },
    279     {
    280       "title": "Software architecture meets LLMs: A systematic literature review",
    281       "relevance": "Closest prior SLR in the same topic area (18 papers); this MLR directly extends and supersedes it with a larger, multivocal corpus."
    282     },
    283     {
    284       "title": "Artificial intelligence for software architecture: Literature review and the road ahead",
    285       "relevance": "Complementary SLR on broader AI (not just GenAI) for SA; used to frame the gap in GenAI-specific coverage."
    286     },
    287     {
    288       "title": "A survey on large language models for code generation",
    289       "relevance": "Prior survey establishing the dominance of code generation in LLM/SE research; this paper argues software architecture is under-studied by comparison."
    290     },
    291     {
    292       "title": "Guidelines for performing systematic literature reviews in software engineering",
    293       "relevance": "Kitchenham & Charters guidelines used to select the bibliographic sources (ACM, IEEEXplore, Scopus, Web of Science)."
    294     },
    295     {
    296       "title": "Software testing with large language models: Survey, landscape, and vision",
    297       "relevance": "Parallel survey showing LLM application in testing; used to contextualize the relative neglect of architectural tasks."
    298     }
    299   ],
    300   "engagement_factors": {
    301     "practical_relevance": {
    302       "score": 2,
    303       "justification": "Software architects and researchers can use the taxonomy of applications, models, and challenges as a structured starting map for GenAI adoption in SA."
    304     },
    305     "surprise_contrarian": {
    306       "score": 1,
    307       "justification": "The finding that 93% of studies lack any output validation is striking, but the GPT dominance and human-in-the-loop prevalence are expected."
    308     },
    309     "fear_safety": {
    310       "score": 1,
    311       "justification": "Discusses ethical concerns, hallucinations, and privacy risks in AI-driven architectural decisions, but framing is academic rather than alarming."
    312     },
    313     "drama_conflict": {
    314       "score": 0,
    315       "justification": "No controversy or adversarial framing; the paper is a neutral cataloging exercise."
    316     },
    317     "demo_ability": {
    318       "score": 0,
    319       "justification": "The paper is a survey with a replication package on Zenodo; there is nothing interactive or demonstrable."
    320     },
    321     "brand_recognition": {
    322       "score": 0,
    323       "justification": "Authors are from University of Oulu, Tampere, Arizona, and IIIT Hyderabad—solid institutions but no famous lab recognition."
    324     }
    325   },
    326   "hn_data": {
    327     "threads": [
    328       {
    329         "hn_id": "30898271",
    330         "title": "Visualizing quantum mechanics in an interactive simulation",
    331         "points": 3,
    332         "comments": 0,
    333         "url": "https://news.ycombinator.com/item?id=30898271"
    334       },
    335       {
    336         "hn_id": "22752977",
    337         "title": "Agent57: Outperforming the Atari Human Benchmark",
    338         "points": 3,
    339         "comments": 0,
    340         "url": "https://news.ycombinator.com/item?id=22752977"
    341       },
    342       {
    343         "hn_id": "43516923",
    344         "title": "UniHOPE: A Unified Approach for Hand-Only and Hand-Object Pose Estimation",
    345         "points": 2,
    346         "comments": 0,
    347         "url": "https://news.ycombinator.com/item?id=43516923"
    348       },
    349       {
    350         "hn_id": "43496516",
    351         "title": "UniHOPE: A Unified Approach for Hand-Only and Hand-Object Pose Estimation",
    352         "points": 2,
    353         "comments": 0,
    354         "url": "https://news.ycombinator.com/item?id=43496516"
    355       },
    356       {
    357         "hn_id": "43759600",
    358         "title": "In between myth and reality: AI for math – a case study in category theory",
    359         "points": 1,
    360         "comments": 0,
    361         "url": "https://news.ycombinator.com/item?id=43759600"
    362       },
    363       {
    364         "hn_id": "30835084",
    365         "title": "Visualizing quantum mechanics in an interactive simulation",
    366         "points": 1,
    367         "comments": 0,
    368         "url": "https://news.ycombinator.com/item?id=30835084"
    369       },
    370       {
    371         "hn_id": "23958170",
    372         "title": "Agent57: Outperforming the Atari Human Benchmark",
    373         "points": 1,
    374         "comments": 0,
    375         "url": "https://news.ycombinator.com/item?id=23958170"
    376       }
    377     ],
    378     "top_points": 3,
    379     "total_points": 13,
    380     "total_comments": 0
    381   }
    382 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs