scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25838B)
      1 {
      2   "paper": {
      3     "title": "The Role of Generative AI in Strengthening Secure Software Coding Practices: A Systematic Perspective",
      4     "authors": ["Hathal S. Alwageed", "Rafiq Ahmad Khan"],
      5     "year": 2025,
      6     "venue": "EASE Companion",
      7     "arxiv_id": "2504.19461",
      8     "doi": "10.1145/3727967.3756840"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["survey_methodology"],
     12   "methodology_tags": ["meta-analysis"],
     13   "key_findings": "This SLR identifies 109 GenAI practices for addressing cybersecurity risks in software coding, organized across 11 risk categories (insecure coding, vulnerable dependencies, poor error handling, weak authentication, misconfigured security controls, inadequate encryption, XSS, insufficient logging, race conditions, inadequate testing, supply chain attacks). The paper reports that automated vulnerability detection accounts for 27% of GenAI's role, followed by secure code generation at 23%. However, many listed 'GenAI practices' are standard software engineering practices (e.g., thread synchronization, output encoding) rather than genuinely AI-driven techniques, and no empirical validation of the taxonomy is provided.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No analysis code, scripts, or repository links are provided anywhere in the paper. A survey can release its extraction scripts and analysis code."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The list of 31 final papers is not provided as a downloadable dataset. Table 4 lists extracted practices but the underlying data extraction spreadsheet or corpus is not released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No computational environment is specified. While this is a survey, any analysis tooling or scripts used for data extraction could have been documented."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. While the search string and databases are listed, a reader would need to guess many steps to reproduce the review."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "This is a systematic literature review that does not run experiments or produce statistical results requiring confidence intervals."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No experimental comparisons are made. The paper is a survey that categorizes practices from literature."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No experiments are conducted, so effect sizes are not applicable."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No experimental sample sizes apply. The number of reviewed papers (31) is an output of the search process, not an experimental design choice requiring power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experimental runs are conducted. This is a literature review."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The survey does not compare its findings against prior surveys or systematic reviews on the same topic. No reference to existing surveys on GenAI for secure coding is made as a baseline for comparison."
     69       },
     70       "baselines_contemporary": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "No experimental baselines are applicable to this survey paper."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No system with components to ablate. This is a systematic literature review."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No experimental evaluation with metrics is conducted."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No system outputs to evaluate. This is a literature review."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No test set is applicable to this survey paper."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 4 breaks down 109 GenAI practices across 11 cybersecurity risk categories (insecure coding, vulnerable dependencies, poor error handling, etc.). Figure 2 provides a percentage breakdown of GenAI roles across 6 application areas."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No discussion of where GenAI practices fail, cases where they are ineffective, or limitations of specific practices. All 109 practices are presented positively without failure analysis."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No negative results are reported. Every finding is positive about GenAI's role in secure coding. No practices that were found to be ineffective or counterproductive are mentioned."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The abstract claims the paper conducts its analysis via 'empirical studies on how these tools help to mitigate security risks', but no original empirical studies are conducted — the paper is purely an SLR. The abstract also claims the findings will 'benefit researchers, software engineers and cybersecurity professionals alike in integrating GenAI into a secure development workflow', but the taxonomy lacks empirical validation."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "Section 3 opens with 'Generative AI (GenAI) significantly contributes to detecting and preventing cybersecurity threats' and Section 5 states 'GenAI has proven effective in identifying vulnerabilities early.' These are causal claims presented without original empirical evidence. The claims are derived from reviewed literature but without quality assessment of the source studies."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "Claims like 'GenAI has proven effective in identifying vulnerabilities early in the software development lifecycle' (Section 5) are unbounded. The paper doesn't specify which GenAI tools, which types of vulnerabilities, or which development contexts. The title itself — 'Strengthening Secure Software Coding Practices' — implies general applicability without bounds."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "Section 4 discusses validity types (external, internal, construct) but does not consider alternative explanations for the findings. For example, no discussion of whether the observed GenAI practices might be repackaged traditional practices, or whether publication bias in the source studies inflates GenAI's perceived effectiveness."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": false,
    134         "answer": false,
    135         "justification": "This is a survey paper that does not make direct measurements. No proxy-outcome gap exists to discuss."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No AI models are used in the methodology. This is a manual systematic literature review."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No prompting is used. This is a manual literature review."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No AI models are run, so no hyperparameters apply."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used in this survey."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Table 1 provides paper counts at each stage per database (2694 initial search results → 220 initial selection → 31 final selection). Table 2 provides inclusion and exclusion criteria (IC1-IC5, EC1-EC5). Figure 1 shows the SLR stage-steps. The search string is provided verbatim."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 4 is titled 'Limitations of the Study' and discusses external validity, internal validity, and construct validity across several paragraphs."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "Section 4 discusses three validity types but in generic terms. External validity says 'The general application scope for particular industries together with emerging technologies demands additional specific studies.' Internal validity claims 'a systematic method of paper selection that uses transparent criteria' without identifying specific threats. These are boilerplate discussions of validity categories, not specific threats to this particular study."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what the results do NOT show. Section 4 makes general claims about generalizability but does not state specific scope boundaries (e.g., which programming languages, types of GenAI tools, or development contexts are excluded)."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The extracted data from the 31 reviewed papers is not available for independent verification. Table 4 summarizes practices but the underlying extraction is not released."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 2 describes the search strategy: databases used (IEEE Xplore, ScienceDirect, ACM, SpringerLink, Google Scholar), the search string, inclusion/exclusion criteria (Table 2), and quality assessment criteria (Table 3). Table 1 shows results per database."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants are involved. The data source is literature from academic databases."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Table 1 documents the pipeline from initial search results (2694) to initial selection (220) to final selection (31) across five databases. Table 2 provides the filtering criteria. Figure 1 diagrams the SLR stages."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: Hathal S. Alwageed at Jouf University (Saudi Arabia) and Rafiq Ahmad Khan at University of Malakand (Pakistan). Neither institution has a product being evaluated."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding source is disclosed, making it impossible to assess funder independence. Absence of disclosure is treated as NO."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is provided anywhere in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This is a systematic literature review that does not evaluate any pre-trained model on benchmarks."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No model evaluation is conducted. This is a literature review."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No benchmark evaluation is conducted. This is a literature review."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this systematic literature review."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved. (Paper inclusion/exclusion criteria are covered under data_integrity.)"
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants or experimental conditions."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants or experimental conditions."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is a survey paper with no computational method of its own."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "This is a survey paper with no computational experiments."
    290       }
    291     },
    292     "survey_methodology": {
    293       "prisma_or_structured_protocol": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "The paper follows Kitchenham's SLR guidelines [17], uses a structured search string, defines inclusion/exclusion criteria (Table 2), provides a stage diagram (Figure 1), and documents paper counts at each stage (Table 1). While not a PRISMA flow diagram, it follows an established systematic review protocol."
    297       },
    298       "quality_assessment_of_sources": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Table 3 lists 6 quality assessment criteria (e.g., 'Relevance of the research problem', 'Methodological Rigor') but they are just labels with no scoring rubric, no operationalization, and no reported scores. The paper explicitly cites Petersen et al. [21] to justify 'performing a quality assessment without setting strict criteria.' The 109 practices in Table 4 are presented without any evaluation of the methodological quality of their source studies."
    302       },
    303       "publication_bias_discussed": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No mention of publication bias anywhere in the paper. No funnel plots, no discussion of whether reviewed papers skew toward positive results about GenAI, no acknowledgment that the literature may overrepresent successful GenAI applications."
    307       }
    308     }
    309   },
    310   "claims": [
    311     {
    312       "claim": "GenAI significantly contributes to detecting and preventing cybersecurity threats in secure software coding.",
    313       "evidence": "Section 3 opening statement, supported by Table 4 listing 109 practices extracted from 31 reviewed papers. No original empirical validation provided.",
    314       "supported": "weak"
    315     },
    316     {
    317       "claim": "Automated Vulnerability Detection and Mitigation accounts for the largest proportion (27%) of GenAI's role in addressing cybersecurity risks.",
    318       "evidence": "Figure 2 shows a percentage breakdown across 6 application areas. The methodology for computing these percentages is not explained.",
    319       "supported": "weak"
    320     },
    321     {
    322       "claim": "Secure Code Generation and Assistance represents 23% of GenAI's contributions to cybersecurity.",
    323       "evidence": "Figure 2. No methodology for computing the percentage is described.",
    324       "supported": "weak"
    325     },
    326     {
    327       "claim": "GenAI has proven effective in identifying vulnerabilities early in the software development lifecycle, preventing threats before they reach the final product.",
    328       "evidence": "Section 5 conclusion. Claim is derived from the literature review but no original empirical evidence is presented, and reviewed studies are not quality-assessed.",
    329       "supported": "unsupported"
    330     },
    331     {
    332       "claim": "Organizations following the combination of AI frameworks and maturity models achieve 30% higher efficiency in adoption of AI projects.",
    333       "evidence": "Section 1, attributed to McKinsey [16]. This is a second-hand citation from a consultancy report, not validated by this study.",
    334       "supported": "weak"
    335     }
    336   ],
    337   "red_flags": [
    338     {
    339       "flag": "Many 'GenAI practices' are standard engineering practices",
    340       "detail": "Table 4 lists items like 'Thread Synchronization', 'Avoid Shared State', 'Output Encoding', 'Use of Strong Encryption Algorithms', 'Atomic Operations', and 'Formal Verification' as 'GenAI Practices.' These are well-established software engineering practices that predate GenAI by decades. Labeling them as GenAI practices inflates the apparent contribution of GenAI."
    341     },
    342     {
    343       "flag": "No quality assessment of source studies despite claiming one",
    344       "detail": "Table 3 lists quality assessment criteria but the paper explicitly cites Petersen et al. to avoid applying strict criteria. No quality scores are reported for any of the 31 included papers. This means the 109 practices are extracted without evaluating whether they come from rigorous or weak studies, so findings from weak studies are presented with the same weight as findings from rigorous ones."
    345     },
    346     {
    347       "flag": "Contradictory inclusion/exclusion criteria",
    348       "detail": "IC4 includes papers 'discussing some AI approaches, models, algorithms... for cybersecurity in software coding' while EC4 excludes papers 'proposing AI approach, model, algorithm... for cybersecurity in software coding.' The distinction between 'discussing' and 'proposing' is unclear and could lead to arbitrary inclusion decisions."
    349     },
    350     {
    351       "flag": "Unexplained percentage methodology",
    352       "detail": "Figure 2 presents a precise breakdown (27%, 23%, 17%, 14%, 12%, 7%) of GenAI's roles but provides no explanation of how these percentages were calculated — whether from paper counts, practice counts, citation frequency, or author judgment."
    353     },
    354     {
    355       "flag": "Abstract overpromises empirical content",
    356       "detail": "The abstract states that the paper conducts its analysis via 'empirical studies on how these tools help to mitigate security risks', but no original empirical studies are conducted. The paper is entirely a literature review."
    357     },
    358     {
    359       "flag": "Uncritical positive framing",
    360       "detail": "All 109 practices and all findings are presented positively. No discussion of GenAI failures in security contexts, risks of GenAI-generated insecure code, or limitations of specific practices. The conclusion states 'GenAI has proven effective' without qualification."
    361     },
    362     {
    363       "flag": "Reference numbering inconsistencies",
    364       "detail": "Several reference numbers in the introduction text appear to mismatch their cited content (e.g., [9] is cited for trustworthiness models by Medeiros et al. in the introduction but reference [9] lists Ulfsnes et al. on Transforming Software Development). This suggests citation management issues."
    365     }
    366   ],
    367   "cited_papers": [
    368     {
    369       "title": "Generative Artificial Intelligence for Software Security Analysis: Fundamentals, Applications, and Challenges",
    370       "authors": ["A. Ding", "G. Li", "X. Yi", "X. Lin", "J. Li", "C. Zhang"],
    371       "year": 2024,
    372       "relevance": "Directly addresses GenAI for software security analysis including fundamentals, applications, and challenges."
    373     },
    374     {
    375       "title": "Generative AI in Cybersecurity: A Comprehensive Review of LLM Applications and Vulnerabilities",
    376       "authors": ["M. A. Ferrag", "F. Alwahedi", "A. A. Battah", "B. Cherif", "A. Mechri", "N. Tihanyi"],
    377       "year": 2024,
    378       "relevance": "Comprehensive review of LLM applications and vulnerabilities in cybersecurity, directly relevant to AI security evaluation."
    379     },
    380     {
    381       "title": "Optimizing Secure AI Lifecycle Model Management With Innovative Generative AI Strategies",
    382       "authors": ["A. O. Almagrabi", "R. A. Khan"],
    383       "year": 2025,
    384       "relevance": "Addresses secure AI lifecycle management with GenAI strategies, relevant to AI-assisted software security."
    385     },
    386     {
    387       "title": "Generative AI for software practitioners",
    388       "authors": ["C. Ebert", "P. Louridas"],
    389       "year": 2023,
    390       "relevance": "Examines GenAI from a software practitioner perspective, relevant to practical AI-assisted development."
    391     },
    392     {
    393       "title": "Generative AI for Cyber Security: Analyzing the Potential of ChatGPT, DALL-E and Other Models for Enhancing the Security Space",
    394       "authors": ["S. Sai", "U. Yashvardhan", "V. Chamola", "B. Sikdar"],
    395       "year": 2024,
    396       "relevance": "Analyzes potential of specific GenAI models (ChatGPT, DALL-E) for cybersecurity applications."
    397     },
    398     {
    399       "title": "Harnessing GPT-4 for generation of cybersecurity GRC policies: A focus on ransomware attack mitigation",
    400       "authors": ["T. McIntosh", "T. Liu", "T. Susnjak", "H. Alavizadeh", "A. Ng", "R. Nowrozy"],
    401       "year": 2023,
    402       "relevance": "Empirical study using GPT-4 for cybersecurity policy generation, directly testing LLM capability in security domain."
    403     },
    404     {
    405       "title": "Machine learning techniques for IoT security: Current research and future vision with generative AI and large language models",
    406       "authors": ["F. Alwahedi", "A. Aldhaheri", "M. A. Ferrag", "A. Battah", "N. Tihanyi"],
    407       "year": 2024,
    408       "relevance": "Reviews ML and GenAI techniques for IoT security, relevant to AI-assisted security across software domains."
    409     },
    410     {
    411       "title": "Accelerating Software Quality: Unleashing the Power of Generative AI for Automated Test-Case Generation and Bug Identification",
    412       "authors": ["Y. Bajaj", "M. Samal"],
    413       "year": 2023,
    414       "relevance": "Examines GenAI for automated test generation and bug identification, relevant to AI-assisted code quality."
    415     },
    416     {
    417       "title": "Transforming Software Development with Generative AI: Empirical Insights on Collaboration and Workflow",
    418       "authors": ["R. Ulfsnes", "N. Moe", "V. Stray", "M. Skarpen"],
    419       "year": 2024,
    420       "relevance": "Empirical study on GenAI's impact on software development collaboration and workflow."
    421     },
    422     {
    423       "title": "Systematic Literature Review on Security Risks and its Practices in Secure Software Development",
    424       "authors": ["R. A. Khan", "S. U. Khan", "H. U. Khan", "M. Ilyas"],
    425       "year": 2022,
    426       "relevance": "Prior SLR on security risks in software development, serves as methodological precedent for this work."
    427     },
    428     {
    429       "title": "Generative AI-Enhanced Cybersecurity Framework for Enterprise Data Privacy Management",
    430       "authors": ["G. S. Nadella", "S. R. Addula", "A. R. Yadulla", "G. S. Sajja", "M. Meesala", "M. H. Maturi"],
    431       "year": 2025,
    432       "relevance": "Proposes a GenAI-enhanced cybersecurity framework relevant to enterprise software security practices."
    433     }
    434   ],
    435   "engagement_factors": {
    436     "practical_relevance": {
    437       "score": 1,
    438       "justification": "The taxonomy of 109 practices could serve as a reference checklist, but many items are generic and lack actionable detail for practitioners."
    439     },
    440     "surprise_contrarian": {
    441       "score": 0,
    442       "justification": "Findings confirm the expected narrative that GenAI can help with software security; no surprising or contrarian conclusions."
    443     },
    444     "fear_safety": {
    445       "score": 1,
    446       "justification": "Discusses cybersecurity risks in software development but does not reveal novel attack vectors or raise new safety concerns."
    447     },
    448     "drama_conflict": {
    449       "score": 0,
    450       "justification": "No controversy, no challenge to existing claims or institutions."
    451     },
    452     "demo_ability": {
    453       "score": 0,
    454       "justification": "No code, tool, or demo is provided. The output is a literature-derived taxonomy."
    455     },
    456     "brand_recognition": {
    457       "score": 0,
    458       "justification": "Authors from Jouf University and University of Malakand; no major AI lab or well-known brand association."
    459     }
    460   }
    461 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs