scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (17684B)
      1 {
      2   "paper": {
      3     "title": "AI-driven software engineering",
      4     "authors": ["Josh Mahmood Ali"],
      5     "year": 2023,
      6     "venue": "Applied and Computational Engineering",
      7     "doi": "10.54254/2977-3903/3/2023030"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No source code, repository URL, or archive link is provided anywhere in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper claims to source data from GitHub, Bitbucket, and GitLab (Section 3.1) and conduct 50 interviews (Table 4), but no dataset, project list, or interview data is released."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, dependencies, or tools used for analysis are mentioned."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No reproduction instructions or steps to replicate the analysis are provided."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Section 3.3 claims 'statistical tests were performed' but the paper contains no results section and reports no numerical results, confidence intervals, or error bars of any kind."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims statistical tests were performed (Section 3.3) to compare AI-driven vs. non-AI projects, but no test results, p-values, or test statistics are reported anywhere."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No effect sizes are reported. The conclusion mentions 'marked improvement in efficiency and user experience' without any quantification."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Table 4 lists 450 GitHub projects, 300 Bitbucket projects, and 50 interviews, but there is no justification for these sample sizes and no power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or any measure of spread is reported. In fact, no quantitative results are reported at all."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Section 3.3 mentions comparing AI-driven projects against non-AI projects, but no actual comparison results are shown."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No baselines or comparison results are reported, so contemporaneity cannot be assessed."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "This is a survey/overview paper that does not propose a system with components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "Section 3.3 mentions metrics like 'bug frequency, system uptime, and user feedback scores' but no actual metric values are reported anywhere in the paper."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The paper claims to have conducted 50 interviews (Table 4) but reports no structured results from these interviews beyond a vague summary in the conclusion that 'feedback from developers highlighted the need for better tooling.'"
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This paper does not train or evaluate any model on a dataset; it is a survey/overview study."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No per-category breakdown of results is provided. The paper has no results section at all."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "The conclusion vaguely mentions challenges around model interpretability and ethical considerations, but no specific failure cases from the claimed analysis are discussed."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No negative results from the claimed experiments are reported. The paper goes from methodology directly to a positive conclusion."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "The abstract claims 'the results indicate a promising uptrend in the integration of AI techniques in software development' and that the study 'offers a comprehensive view of the current landscape,' but the paper contains no results section and no data to support these claims."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The conclusion states 'Projects integrating AI methodologies displayed a marked improvement in efficiency and user experience,' which is a causal claim. No experimental design, confound analysis, or statistical evidence supports this claim."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes broad claims about AI-driven software engineering generally without bounding them to any specific domain, language, or tested setting. The title itself is unbounded."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations for any observed patterns are discussed. The paper does not consider confounds such as selection bias (projects that adopt AI may already be better-resourced)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "This paper does not use or evaluate any specific AI/LLM model; it is a survey of AI-driven software engineering practices."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting of AI models is involved in this study."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No AI model experiments are run in this paper; it is a survey/overview study."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used in this paper."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Section 3.1 says 'Projects with AI components or dependencies were isolated for further analysis' but provides no criteria for what counts as 'AI components or dependencies,' no filtering pipeline, and no counts at each stage."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no limitations, threats to validity, or similar section in the paper."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed anywhere in the paper."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No scope boundaries are stated. The paper does not specify what it does NOT claim or what settings are excluded."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data is available. The claimed 450 GitHub projects, 300 Bitbucket projects, and interview responses are not released or referenced."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "Section 3.1 mentions sourcing data from GitHub, Bitbucket, and GitLab but provides no detail on search queries, time period, inclusion/exclusion criteria, or how 'AI components or dependencies' were identified."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Section 3.2 mentions 'structured interviews with software developers and AI specialists' but provides no information on how participants were recruited, from which companies or platforms, or any selection criteria."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No data pipeline is documented. The paper goes from mentioning data sources in Section 3 directly to conclusions in Section 4 with no intermediate processing steps described."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section mentioning funding."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The author's affiliation is listed as Saint Leo University."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure is a NO per the schema guidance."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper does not evaluate any pre-trained model's capability on a benchmark. It is a survey/overview study."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "This paper does not evaluate any pre-trained model on a benchmark."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper does not evaluate any pre-trained model on a benchmark."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The paper claims 50 structured interviews (Table 4) but no pre-registration is mentioned."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "The paper involves 50 human interview participants but no IRB or ethics board approval is mentioned."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "Table 4 describes interviewees as 'developers and AI experts' but no demographics (experience level, role, geographic distribution, etc.) are reported."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No inclusion or exclusion criteria for interview participants are stated."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "The interview study is qualitative/observational, not an experimental study requiring randomization of participants to conditions."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "Blinding is not applicable to a qualitative interview study."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No information on participant attrition is provided. The paper does not state whether all 50 interviews were completed or if any participants dropped out."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This is a survey/overview paper that does not propose or run an AI method with inference costs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "This is a survey/overview paper with no computational experiments to budget."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The adoption of AI in software development is a transformative shift, not a fleeting trend.",
    286       "evidence": "Stated in Section 4 (Conclusion) as a result of the 'multi-pronged methodology,' but no specific data or quantitative results are presented to support this claim.",
    287       "supported": "unsupported"
    288     },
    289     {
    290       "claim": "Projects integrating AI methodologies displayed a marked improvement in efficiency and user experience.",
    291       "evidence": "Stated in Section 4 (Conclusion). No quantitative results, comparison data, or statistical tests are presented despite Section 3.3 claiming such tests were performed.",
    292       "supported": "unsupported"
    293     },
    294     {
    295       "claim": "Challenges exist around model interpretability, ethical considerations, and the steep learning curve associated with integrating AI.",
    296       "evidence": "Mentioned in Section 4 (Conclusion) and the abstract. Supported qualitatively by the literature review in Section 2 (e.g., Arnold et al., 2019; Zhang et al., 2020) but not by any original empirical findings.",
    297       "supported": "weak"
    298     }
    299   ],
    300   "methodology_tags": [
    301     "qualitative",
    302     "observational"
    303   ],
    304   "key_findings": "This paper claims to study AI-driven software engineering using data from GitHub/Bitbucket repositories and 50 developer interviews, but provides no actual results section. The conclusion asserts that AI-driven projects show 'marked improvement in efficiency and user experience' and that challenges exist around model interpretability and ethics, but these conclusions lack any quantitative backing. The paper is essentially a literature review with unsubstantiated claims of original empirical work.",
    305   "red_flags": [
    306     {
    307       "flag": "Missing results section",
    308       "detail": "The paper describes a methodology (Section 3) involving quantitative analysis of 750 projects and 50 interviews, then jumps directly to conclusions (Section 4) without any results section. No tables, figures, or statistics from the claimed analysis are presented."
    309     },
    310     {
    311       "flag": "Claims significantly outrun evidence",
    312       "detail": "The conclusion states 'Projects integrating AI methodologies displayed a marked improvement in efficiency and user experience' but no data, metrics, or statistical results are shown to support this claim."
    313     },
    314     {
    315       "flag": "Phantom data",
    316       "detail": "Table 4 lists 450 GitHub projects, 300 Bitbucket projects, and 50 interviews as data sources, but no analysis of this data appears anywhere in the paper. There is no way to verify whether this data collection actually occurred."
    317     },
    318     {
    319       "flag": "No limitations discussion",
    320       "detail": "The paper contains no limitations section, no threats to validity, and no acknowledgment of study weaknesses despite making broad claims about AI-driven software engineering."
    321     },
    322     {
    323       "flag": "Missing IRB/ethics review for human subjects",
    324       "detail": "The paper claims to have interviewed 50 people but does not mention IRB approval, informed consent, or any ethics review process."
    325     }
    326   ],
    327   "cited_papers": [
    328     {
    329       "title": "Machine learning in software engineering: Models, methods, and applications",
    330       "authors": ["S. Wang", "J. Wen", "X. Wang", "R. Zhou"],
    331       "year": 2020,
    332       "relevance": "Survey of ML applications in software engineering, directly relevant to methodology quality assessment of review papers in this space."
    333     },
    334     {
    335       "title": "Whole test suite generation",
    336       "authors": ["G. Fraser", "A. Arcuri"],
    337       "year": 2013,
    338         "doi": "10.1109/TSE.2012.14",
    339       "relevance": "Foundational work on AI/genetic algorithms for automated test generation in software engineering."
    340     },
    341     {
    342       "title": "AI-driven software engineering: Challenges and future directions",
    343       "authors": ["T. Javdani", "A. Shamsaei", "M. Shahin"],
    344       "year": 2021,
    345       "relevance": "Discusses challenges and future directions for AI in software engineering, directly relevant to the survey scope."
    346     },
    347     {
    348       "title": "A survey on the application of machine learning in software engineering",
    349       "authors": ["P. Raschke", "J. Bartels", "V. Gruhn"],
    350       "year": 2019,
    351       "arxiv_id": "1905.13209",
    352       "relevance": "Survey of ML in software engineering, relevant for comparing survey methodology quality."
    353     },
    354     {
    355       "title": "Towards building a universal defect prediction model",
    356       "authors": ["F. Zhang", "A. Mockus", "I. Keivanloo", "Y. Zou"],
    357       "year": 2020,
    358       "relevance": "Addresses ML-driven defect prediction, a core application area of AI in software engineering."
    359     }
    360   ]
    361 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs