scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23689B)
      1 {
      2   "paper": {
      3     "title": "Advancing Software Quality: A Standards-Focused Review of LLM-Based Assurance Techniques",
      4     "authors": [
      5       "Avinash Patil"
      6     ],
      7     "year": 2025,
      8     "venue": "arXiv preprint",
      9     "arxiv_id": "2505.13766"
     10   },
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": false,
     16         "justification": "No GitHub link, Zenodo archive, or any code repository URL is provided. This survey could have released analysis scripts or data extraction tools, but none are mentioned."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "The survey's corpus of 223+ reviewed papers is not released as a structured dataset. No download link or supplementary data file is provided."
     22       },
     23       "environment_specified": {
     24         "applies": false,
     25         "answer": false,
     26         "justification": "This is a literature survey with no computational experiments, so there is no software environment to specify."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No step-by-step instructions are provided for how to replicate the literature search, paper selection, or analysis. A reader cannot reproduce the paper corpus without knowing what databases were searched and what queries were used."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": false,
     37         "answer": false,
     38         "justification": "This is a descriptive literature survey, not a meta-analysis. It reports raw counts and percentages of papers but does not aggregate effect sizes or statistics that would require confidence intervals."
     39       },
     40       "significance_tests": {
     41         "applies": false,
     42         "answer": false,
     43         "justification": "No comparative statistical claims are made that require significance testing. The paper reports descriptive counts from the corpus."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": false,
     47         "answer": false,
     48         "justification": "This is a descriptive survey with no meta-analytic aggregation, so effect sizes are not applicable."
     49       },
     50       "sample_size_justified": {
     51         "applies": false,
     52         "answer": false,
     53         "justification": "No experiments or statistical analyses were conducted; this is a literature survey."
     54       },
     55       "variance_reported": {
     56         "applies": false,
     57         "answer": false,
     58         "justification": "No experimental runs were conducted; this is a literature survey reporting descriptive statistics."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The survey does not compare itself against prior surveys in the LLM-SQA space. No prior reviews are systematically compared to demonstrate additive value or methodological improvement."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "Since baselines_included applies (prior surveys in LLM-SQA exist and could be compared against), the contemporariness of those baselines also applies. However, since the paper includes no baseline comparisons at all, it cannot be assessed whether baselines are contemporary. Answer is false because no baselines are used."
     71       },
     72       "ablation_study": {
     73         "applies": false,
     74         "answer": false,
     75         "justification": "This is a survey paper; ablation studies do not apply."
     76       },
     77       "multiple_metrics": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The survey characterizes the literature using multiple dimensions: publication trends, dataset utilization (Figure 2), evaluation approaches (Figure 3), fine-tuning adoption (Figure 4), LLM usage distribution (Figure 5), and prompting strategies (Figure 6)."
     81       },
     82       "human_evaluation": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "Human evaluation of the survey's outputs (e.g., mapping quality, completeness of the standards analysis) is not conducted and is not clearly required for this type of review."
     86       },
     87       "held_out_test_set": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "This is a survey paper; there is no train/test split applicable."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Table I provides a detailed breakdown of LLM applications mapped to specific standards (ISO/IEC 12207, 25010, 5055, ISO 9001, CMMI, TMM) with per-category listings of applications and references."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section VI (Challenges, Limitations, and Risks) discusses failure modes and limitations of LLM-based SQA, including data privacy exposure, model bias, black-box explainability problems, and resource constraints."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section V.B notes that 'nearly 30% of the reviewed papers did not specify any dataset,' Section V.E notes that 'nearly 19% of papers did not specify which LLM was used,' and Section VI identifies challenges and failure patterns in the literature."
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The abstract promises a survey of the LLM-SQA intersection, a discussion of challenges, and future directions. The paper delivers on these promises with sections covering each claim, including the mentioned empirical case studies and the standards mapping in Table I."
    113       },
    114       "causal_claims_justified": {
    115         "applies": false,
    116         "answer": false,
    117         "justification": "The paper makes no causal claims itself; it surveys and categorizes existing work without making causal assertions about what causes quality improvements."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "The paper makes broad claims about LLM potential for SQA (e.g., 'LLMs can be integrated into SQA in a trustworthy, efficient, and standards-aligned manner') without adequately bounding these to the specific papers reviewed. The survey covers 2023-2025 literature yet presents conclusions that generalize broadly without acknowledging the novelty and instability of the field."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": false,
    126         "answer": false,
    127         "justification": "This is a descriptive survey that does not present original empirical results. The NA rule applies: the paper makes no empirical claims that would require alternative explanations."
    128       }
    129     },
    130     "setup_transparency": {
    131       "model_versions_specified": {
    132         "applies": false,
    133         "answer": false,
    134         "justification": "This is a survey paper that does not run experiments with LLMs, so model versions are not applicable."
    135       },
    136       "prompts_provided": {
    137         "applies": false,
    138         "answer": false,
    139         "justification": "No LLM prompting is used in this survey paper."
    140       },
    141       "hyperparameters_reported": {
    142         "applies": false,
    143         "answer": false,
    144         "justification": "No LLM experiments are conducted; this is a literature survey."
    145       },
    146       "scaffolding_described": {
    147         "applies": false,
    148         "answer": false,
    149         "justification": "No agentic scaffolding is used; this is a literature survey."
    150       },
    151       "data_preprocessing_documented": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Section IV describes 7 high-level selection criteria for including papers (focus on LLM-SQA, alignment with standards, empirical rigor, recency, breadth, practical relevance, peer review). However, it does not describe the actual search process: which databases were queried, what search strings were used, how many papers were initially found, or how many were excluded at each stage. 'Over 223 papers' are mentioned without explaining how the starting pool was identified or how it was filtered."
    155       }
    156     },
    157     "limitations_and_scope": {
    158       "limitations_section_present": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "Section VI is titled 'Challenges, Limitations, and Risks' but addresses limitations of LLM technology adoption in general, not limitations of this survey's methodology. There is no substantive discussion of the survey's own limitations (e.g., possible incomplete coverage, possible publication bias, lack of quality assessment of reviewed papers, single-author review without inter-rater reliability)."
    162       },
    163       "threats_to_validity_specific": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "No threats to validity specific to this survey are discussed. The paper does not acknowledge threats such as: selection bias from the informal search process, the lack of formal systematic review protocol, single-author classification without reliability checks, or possible incomplete coverage of the 2023-2025 literature."
    167       },
    168       "scope_boundaries_stated": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "The survey does not explicitly state what it does NOT cover. For example, it does not clarify whether non-English papers were excluded, which databases were searched, whether gray literature was included, or what LLM tasks are out of scope."
    172       }
    173     },
    174     "data_integrity": {
    175       "raw_data_available": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The full list of 223+ reviewed papers is not provided as a structured dataset. Readers cannot verify which papers were included in each category or reproduce the classification decisions."
    179       },
    180       "data_collection_described": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "Section IV lists inclusion criteria but does not describe the actual data collection procedure: no search databases are named (e.g., IEEE Xplore, ACM DL, Semantic Scholar), no search queries are given, and no time period for the search itself is stated beyond 'papers published from 2023 onward.'"
    184       },
    185       "recruitment_methods_described": {
    186         "applies": false,
    187         "answer": false,
    188         "justification": "No human participants; this is a literature survey using published papers as data."
    189       },
    190       "data_pipeline_documented": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "The pipeline from initial paper discovery to the final set of 223+ papers is not documented. There is no PRISMA-style flowchart or equivalent showing how many papers were initially found, how many were excluded at each stage, and why. The reader is left with 7 inclusion criteria but no evidence of how they were applied."
    194       }
    195     },
    196     "conflicts_of_interest": {
    197       "funding_disclosed": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "No acknowledgments section or funding disclosure is present in the paper. The author's affiliation with Juniper Networks Inc. is listed but no statement about whether Juniper funded or supported this work is provided."
    201       },
    202       "affiliations_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The author's affiliation with Juniper Networks Inc. is disclosed on the title page alongside ORCID. The paper does not evaluate Juniper products specifically, so this affiliation does not constitute a direct conflict."
    206       },
    207       "funder_independent_of_outcome": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "The author is affiliated with Juniper Networks Inc. but no funding source is disclosed. Absence of a funding disclosure is not evidence of being unfunded — the work could have been done on company time. Since we cannot verify funder independence, answer is false."
    211       },
    212       "financial_interests_declared": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No competing interests statement or declaration of financial interests is provided anywhere in the paper."
    216       }
    217     },
    218     "contamination": {
    219       "training_cutoff_stated": {
    220         "applies": false,
    221         "answer": false,
    222         "justification": "This is a literature survey that does not evaluate a pre-trained model on any benchmark. Contamination questions do not apply."
    223       },
    224       "train_test_overlap_discussed": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "No benchmark evaluation of pre-trained models is conducted in this paper."
    228       },
    229       "benchmark_contamination_addressed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "No benchmark evaluation of pre-trained models is conducted in this paper."
    233       }
    234     },
    235     "human_studies": {
    236       "pre_registered": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No human participants involved; this is a literature survey."
    240       },
    241       "irb_or_ethics_approval": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants involved; this is a literature survey."
    245       },
    246       "demographics_reported": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants involved; this is a literature survey."
    250       },
    251       "inclusion_exclusion_criteria": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants involved; this is a literature survey."
    255       },
    256       "randomization_described": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants involved; this is a literature survey."
    260       },
    261       "blinding_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants involved; this is a literature survey."
    265       },
    266       "attrition_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants involved; this is a literature survey."
    270       }
    271     },
    272     "cost_and_practicality": {
    273       "inference_cost_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "This is a survey paper with no original LLM experiments, so inference cost is not applicable."
    277       },
    278       "compute_budget_stated": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "This is a survey paper with no original computational experiments, so compute budget is not applicable."
    282       }
    283     }
    284   },
    285   "claims": [
    286     {
    287       "claim": "The number of publications on LLMs in SQA increased sharply after 2023: 31 papers in 2023, 91 in 2024, and 127 in 2025.",
    288       "evidence": "Section V.A, Figure 1. Counts are stated in the text.",
    289       "supported": "weak"
    290     },
    291     {
    292       "claim": "Nearly 30% of the reviewed papers did not specify any dataset, highlighting a concerning gap in reproducibility.",
    293       "evidence": "Section V.B, Figure 2. Percentage stated without showing the exact count or how papers were classified.",
    294       "supported": "moderate"
    295     },
    296     {
    297       "claim": "GPT-4 was the most commonly used model (21%), and nearly 19% of papers did not specify which LLM was used.",
    298       "evidence": "Section V.E, Figure 5. Percentages stated without showing raw counts or the classification methodology.",
    299       "supported": "moderate"
    300     },
    301     {
    302       "claim": "Only 14.3% of the studies reported using fine-tuned models; most leveraged zero-shot or few-shot prompting.",
    303       "evidence": "Section V.D, Figure 4.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "LLMs can be integrated into SQA processes across all major software quality standards (ISO/IEC 12207, 25010, 5055, ISO 9001, CMMI, TMM) in a trustworthy and standards-aligned manner.",
    308       "evidence": "Table I provides a mapping of LLM applications to standards attributes, citing 200+ papers across categories. However, the survey conducts no original experiments to validate these mappings.",
    309       "supported": "weak"
    310     }
    311   ],
    312   "methodology_tags": [
    313     "meta-analysis",
    314     "qualitative"
    315   ],
    316   "key_findings": "This survey of 223+ papers (2023-2025) finds that LLM-based techniques are being applied across all major software quality assurance domains including requirement validation, defect detection, test generation, and compliance checks. Publication volume has grown roughly fourfold from 2023 to 2025 (31 to 127 papers). GPT-4 is the most commonly used model (21%), while nearly 19% of papers failed to specify the LLM used and nearly 30% did not specify their dataset, indicating widespread reproducibility gaps in the field. The paper maps LLM applications to six major software quality standards (ISO/IEC 12207, 25010, 5055, ISO 9001, CMMI, TMM) and identifies governance, explainability, and data privacy as key outstanding challenges.",
    317   "red_flags": [
    318     {
    319       "flag": "Informal search methodology without PRISMA or systematic review protocol",
    320       "detail": "Section IV lists 7 inclusion criteria but never describes which databases were searched, what search queries were used, how many papers were initially retrieved, or how papers were filtered to reach the final set of 223+. This is not a systematic review in any rigorous sense."
    321     },
    322     {
    323       "flag": "Single-author review without inter-rater reliability",
    324       "detail": "The entire paper selection and categorization was apparently performed by a single author (Avinash Patil) with no mention of a second reviewer or inter-rater reliability checks. Systematic reviews typically require at least two independent reviewers to control for selection and classification bias."
    325     },
    326     {
    327       "flag": "No quality assessment of reviewed papers",
    328       "detail": "The survey classifies papers by topic and counts them but applies no quality filter to the 223+ papers beyond the seven informal inclusion criteria. Weak or methodologically flawed papers are treated equally with rigorous ones, which launders their results into the survey's conclusions."
    329     },
    330     {
    331       "flag": "Broad unsupported generalizations from narrow evidence",
    332       "detail": "The abstract and conclusion claim LLMs can be integrated 'in a trustworthy, efficient, and standards-aligned manner' but the survey itself presents evidence that a substantial share of the reviewed studies omit basic reproducibility details (nearly 30% specify no dataset, nearly 19% specify no LLM). This contradicts the optimistic framing."
    333     },
    334     {
    335       "flag": "No competing interests or funding disclosure",
    336       "detail": "The paper lists affiliation with Juniper Networks Inc. but provides no statement on whether the research was funded, whether there are competing interests, or whether Juniper supported this work."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Evaluating large language models trained on code",
    342       "authors": [
    343         "M. Chen",
    344         "J. Tworek",
    345         "H. Jun"
    346       ],
    347       "year": 2021,
    348       "arxiv_id": "2107.03374",
    349       "relevance": "Foundational paper on evaluating LLMs for code (Codex/GitHub Copilot), directly relevant to LLM coding capability assessment."
    350     },
    351     {
    352       "title": "LLM-based test-driven interactive code generation: User study and empirical evaluation",
    353       "authors": [
    354         "S. Fakhoury",
    355         "A. Naik",
    356         "G. Sakkas",
    357         "S. Chakraborty",
    358         "S. K. Lahiri"
    359       ],
    360       "year": 2024,
    361       "relevance": "Empirical user study of LLM-based code generation combining test-driven development, directly relevant to LLM capability evaluation."
    362     },
    363     {
    364       "title": "Mutation-guided LLM-based test generation at Meta",
    365       "authors": [
    366         "C. Foster",
    367         "A. Gulati",
    368         "M. Harman"
    369       ],
    370       "year": 2025,
    371       "arxiv_id": "2501.12862",
    372       "relevance": "Industrial-scale study of LLM-based test generation at Meta, relevant to real-world LLM deployment in software engineering."
    373     },
    374     {
    375       "title": "AI-powered code review with LLMs: Early results",
    376       "authors": [
    377         "Z. Rasheed",
    378         "M. A. Sami",
    379         "M. Waseem"
    380       ],
    381       "year": 2024,
    382       "arxiv_id": "2404.18496",
    383       "relevance": "Empirical evaluation of LLM-based automated code review, directly relevant to LLM software quality applications."
    384     },
    385     {
    386       "title": "Advancing requirements engineering through generative AI: Assessing the role of LLMs",
    387       "authors": [
    388         "C. Arora",
    389         "J. Grundy",
    390         "M. Abdelrazek"
    391       ],
    392       "year": 2024,
    393       "relevance": "Study of LLMs for requirements engineering including practitioner feedback, relevant to LLM-assisted software development evaluation."
    394     },
    395     {
    396       "title": "Fine-tuning and prompt engineering for large language models-based code review automation",
    397       "authors": [
    398         "C. Pornprasit",
    399         "C. Tantithamthavorn"
    400       ],
    401       "year": 2024,
    402       "relevance": "Empirical comparison of fine-tuning vs. prompt engineering for LLM code review automation, directly relevant to LLM methodology evaluation."
    403     },
    404     {
    405       "title": "Large language models as configuration validators",
    406       "authors": [
    407         "X. Lian",
    408         "Y. Chen",
    409         "R. Cheng"
    410       ],
    411       "year": 2025,
    412       "relevance": "Evaluation of LLMs for software configuration validation, relevant to LLM software quality assurance applications."
    413     },
    414     {
    415       "title": "LLM-assisted static analysis for detecting security vulnerabilities",
    416       "authors": [
    417         "Z. Li",
    418         "S. Dutta",
    419         "M. Naik"
    420       ],
    421       "year": 2024,
    422       "arxiv_id": "2405.17238",
    423       "relevance": "Study of LLMs for security vulnerability detection via static analysis, directly relevant to LLM-based SQA."
    424     },
    425     {
    426       "title": "Enhancing large language models for text-to-testcase generation",
    427       "authors": [
    428         "S. Alagarsamy",
    429         "C. Tantithamthavorn",
    430         "C. Arora",
    431         "A. Aleti"
    432       ],
    433       "year": 2024,
    434       "arxiv_id": "2402.11910",
    435       "relevance": "Empirical study of LLMs for automated test case generation, relevant to LLM test automation capability assessment."
    436     },
    437     {
    438       "title": "An empirical study on the code refactoring capability of large language models",
    439       "authors": [
    440         "J. Cordeiro",
    441         "S. Noei",
    442         "Y. Zou"
    443       ],
    444       "year": 2024,
    445       "arxiv_id": "2411.02320",
    446       "relevance": "Empirical evaluation of LLM refactoring capability, directly relevant to LLM software quality assurance assessment."
    447     },
    448     {
    449       "title": "Expectations vs. experience: Evaluating the usability of code generation tools powered by large language models",
    450       "authors": [
    451         "P. Vaithilingam"
    452       ],
    453       "year": 2022,
    454       "relevance": "CHI user study on the usability of LLM-based code generation tools, relevant to human factors in LLM-assisted development."
    455     },
    456     {
    457       "title": "Industry-academia collaborations in software testing: experience and success stories from Canada",
    458       "authors": [
    459         "V. Garousi",
    460         "K. Petersen",
    461         "B. Ozkan"
    462       ],
    463       "year": 2019,
    464       "arxiv_id": "1904.04986",
    465       "relevance": "Empirical study of software testing practices, relevant to understanding software quality assessment methodology."
    466     }
    467   ]
    468 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs